Merged. Changes from all 23 commits.
- 83122da refactor: generate CycloneDX BOMs using cyclonedx-python-lib (saquibsaifee, Mar 1, 2026)
- 616f6ec Merge branch 'v0.2' into refactor-bom-generation-to-use-cyclonedx-pyt… (saquibsaifee, Mar 1, 2026)
- 0d12800 Merge pull request #65 from saquibsaifee/refactor-bom-generation-to-u… (eaglei15, Mar 2, 2026)
- 520e91c Revert "refactor: generate CycloneDX BOMs using cyclonedx-python-lib" (eaglei15, Mar 2, 2026)
- dc31236 Merge pull request #68 from GenAI-Security-Project/revert-65-refactor… (eaglei15, Mar 2, 2026)
- 93e5347 feat: add safetensors hyperparameter extraction with GGUF parity (afogel, Mar 9, 2026)
- 13bd8f9 refactor(logging): replace print statements with structured logging (emmanuelgjr, Apr 13, 2026)
- d49cce0 Merge branch 'GenAI-Security-Project:main' into refactor-bom-generati… (saquibsaifee, Apr 15, 2026)
- 65fc113 refactor(validation): replace manual jsonschema with cyclonedx-python… (saquibsaifee, Apr 16, 2026)
- 0c79f5d refactor(scoring): replace manual jsonschema validator with cyclonedx… (saquibsaifee, Apr 16, 2026)
- 7d0ac83 refactor(license_utils): replace license-expression lib with cycloned… (saquibsaifee, Apr 16, 2026)
- cb31d90 refactor(service): use LicenseFactory and ToolRepository; fix _genera… (saquibsaifee, Apr 16, 2026)
- 4a2d4dc refactor(extractor): remove duplicate LICENSE_MAPPINGS; reuse license… (saquibsaifee, Apr 16, 2026)
- bec6a6b chore: remove bundled JSON schemas; drop jsonschema and license-expre… (saquibsaifee, Apr 16, 2026)
- ade22c2 docs: update README and CONTRIBUTING to reflect cyclonedx-python-lib … (saquibsaifee, Apr 16, 2026)
- e32a88c Merge branch 'v0.2' into refactor-bom-generation-to-use-cyclonedx-pyt… (saquibsaifee, Apr 16, 2026)
- 016e7bd fix: restore _create_aibom_structure after bad v0.2 merge conflict re… (saquibsaifee, Apr 16, 2026)
- 88acb88 chore(deps): add missing httpx and ruff to requirements.txt (saquibsaifee, Apr 16, 2026)
- b988d41 Merge pull request #70 from afogel/safetensors_extract_hyperparameters (eaglei15, Apr 20, 2026)
- 4df60bf Merge pull request #73 from emmanuelgjr/refactor/16-structured-logging (eaglei15, Apr 20, 2026)
- 7b5d5b7 Merge PR #70: safetensors hyperparameter extraction (saquibsaifee, Apr 22, 2026)
- 38d6c68 Merge commit '4df60bf' into refactor-bom-generation-to-use-cyclonedx-… (saquibsaifee, Apr 22, 2026)
- 7ce04fd Merge pull request #75 from saquibsaifee/refactor-bom-generation-to-u… (eaglei15, Apr 27, 2026)
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -80,7 +80,7 @@ import json
import logging

# Third-party
import requests
from cyclonedx.model.bom import Bom
from huggingface_hub import HfApi

# Local imports
@@ -124,7 +124,6 @@ aibom-generator/
│ │ ├── validation.py # CycloneDX 1.6 schema validation
│ │ ├── license_utils.py # License normalization
│ │ └── analytics.py # Usage tracking
│ ├── schemas/ # JSON schemas (CycloneDX, SPDX)
│ └── templates/ # HTML templates
├── tests/ # Unit and integration tests
└── requirements.txt
@@ -134,7 +133,8 @@ aibom-generator/

- **Service-oriented architecture**: Core logic lives in `models/service.py`
- **Registry-driven fields**: Field definitions from `models/registry.py`
- **CycloneDX 1.6 compliance**: All AIBOMs validate against the schema
- **CycloneDX Python Library**: BOM serialization, schema validation, and SPDX license handling delegate to [`cyclonedx-python-lib`](https://github.com/CycloneDX/cyclonedx-python-lib) — avoid manual JSON construction for anything the library supports
- **CycloneDX 1.6 compliance**: All AIBOMs validate against the schema via the library's built-in `JsonValidator`
- **Completeness scoring**: Quality metrics in `models/scoring.py`

## Running Tests
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ The tool is also listed in the official **[CycloneDX Tool Center](https://cyclon
- Calculates **AIBOM completeness scoring** with recommendations
- Supports metadata extraction from model cards, configurations, and repository files

> **Built on [CycloneDX Python Library](https://github.com/CycloneDX/cyclonedx-python-lib)** — BOM generation, schema validation, and SPDX license handling are all powered by the official `cyclonedx-python-lib`, ensuring spec-compliant output without manual JSON construction.

---

## 🛠 Features
8 changes: 6 additions & 2 deletions pyproject.toml
@@ -18,15 +18,14 @@ classifiers = [
]
dependencies = [
"beautifulsoup4>=4.11.0",
"cyclonedx-python-lib[json-validation]>=11.7.0",
"datasets>=2.0.0",
"fastapi>=0.104.0",
"flask>=2.3.0",
"gunicorn>=21.2.0",
"httpx>=0.25.0",
"huggingface_hub>=0.19.0",
"jinja2>=3.0.0",
"jsonschema>=4.17.0",
"license-expression>=30.4.4",
"packageurl-python>=0.11.1",
"pydantic>=2.4.0",
"python-multipart",
@@ -36,6 +35,7 @@ dependencies = [
"torch>=2.0.0",
"transformers>=4.36.0",
"uvicorn>=0.24.0",
"safetensors>=0.4.0",
]

[project.optional-dependencies]
@@ -71,5 +71,9 @@ pythonpath = [

[dependency-groups]
dev = [
"pytest>=7.0.0",
"pytest-cov>=4.0.0",
"pytest-mock>=3.10.0",
"ruff",
"gguf>=0.6.0",
]
9 changes: 4 additions & 5 deletions requirements.txt
@@ -5,23 +5,22 @@ fastapi>=0.104.0
uvicorn>=0.24.0
pydantic>=2.4.0
requests>=2.31.0
python-dotenv>=1.0.0
PyYAML>=6.0.1
flask>=2.3.0
gunicorn>=21.2.0
cyclonedx-python-lib>=4.0.0
cyclonedx-python-lib[json-validation]>=11.7.0
httpx>=0.25.0
packageurl-python>=0.17.6
python-multipart
jinja2>=3.0.0
datasets>=2.0.0
beautifulsoup4>=4.11.0
nltk>=3.8.0
python-dateutil>=2.8.0
jsonschema>=4.17.0
sentencepiece>=0.1.99
safetensors>=0.4.0

# Test dependencies
pytest>=7.0.0
pytest-mock>=3.10.0
pytest-cov>=4.0.0
ruff
gguf>=0.6.0
18 changes: 12 additions & 6 deletions src/cli.py
@@ -1,7 +1,10 @@
import argparse
import logging
import sys
from .controllers.cli_controller import CLIController

logger = logging.getLogger(__name__)

def main():
parser = argparse.ArgumentParser(description="OWASP AIBOM Generator CLI")
parser.add_argument("model_id", nargs="?", help="Hugging Face Model ID (e.g. 'owner/model')")
@@ -13,9 +16,12 @@ def main():
parser.add_argument("--name", "-n", help="Component name in metadata")
parser.add_argument("--version", "-v", help="Component version in metadata")
parser.add_argument("--manufacturer", "-m", help="Component manufacturer/supplier in metadata")

args = parser.parse_args()


log_level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=log_level, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

controller = CLIController()

if args.test:
@@ -36,22 +42,22 @@ def main():
"CIRCL/vulnerability-severity-classification-roberta-base"
]

print(f"Running test mode against {len(test_models)} models...")
logger.info("Running test mode against %d models...", len(test_models))
for model in test_models:
print(f"\n{'='*50}\nTesting model: {model}\n{'='*50}")
logger.info("Testing model: %s", model)
try:
controller.generate(
model_id=model,
output_file=args.output,
include_inference=args.inference,
enable_summarization=True, # Ensure summarization is on for testing description
enable_summarization=True, # Ensure summarization is on for testing description
verbose=args.verbose,
name=args.name,
version=args.version,
manufacturer=args.manufacturer
)
except Exception as e:
print(f"Error testing {model}: {e}")
logger.error("Error testing %s: %s", model, e)
sys.exit(0)

if not args.model_id:
61 changes: 28 additions & 33 deletions src/controllers/cli_controller.py
@@ -3,7 +3,6 @@
from typing import Optional
from ..models.service import AIBOMService
from ..models.scoring import calculate_completeness_score
from ..models.scoring import calculate_completeness_score
from ..config import OUTPUT_DIR, TEMPLATES_DIR
from ..utils.formatter import export_aibom
import os
@@ -25,15 +24,15 @@ def generate(self, model_id: str, output_file: Optional[str] = None, include_inf
enable_summarization: bool = False, verbose: bool = False,
name: Optional[str] = None, version: Optional[str] = None, manufacturer: Optional[str] = None):
if verbose:
logging.getLogger().setLevel(logging.INFO)
print(f"Generating AIBOM for {model_id}...")
logging.getLogger().setLevel(logging.DEBUG)

logger.info("Generating AIBOM for %s...", model_id)

versions_to_generate = ["1.6", "1.7"]
reports = []
generated_aiboms = {}
print(f" - Generating AIBOM model data...")
generated_aiboms = {}

logger.info("Generating AIBOM model data...")
try:
primary_aibom = self.service.generate_aibom(
model_id,
@@ -85,8 +84,7 @@ def generate(self, model_id: str, output_file: Optional[str] = None, include_inf
output_file_primary = output_file_1_6

except Exception as e:
logger.error(f"Failed to generate SBOM: {e}", exc_info=True)
print(f" ❌ Failed to generate SBOM: {e}")
logger.error("Failed to generate SBOM: %s", e, exc_info=True)
reports = []

if reports:
@@ -130,7 +128,7 @@ def generate(self, model_id: str, output_file: Optional[str] = None, include_inf
with open(html_output_file, "w") as f:
f.write(html_content)

print(f"\n📄 HTML Report:\n {html_output_file}")
logger.info("HTML Report: %s", html_output_file)

# Copy static assets
try:
@@ -148,19 +146,19 @@ def generate(self, model_id: str, output_file: Optional[str] = None, include_inf
if os.path.exists(static_dst):
shutil.rmtree(static_dst)
shutil.copytree(static_src, static_dst)
# print(f" - Static assets copied to: {static_dst}")
logger.debug("Static assets copied to: %s", static_dst)
else:
logger.warning(f"Static source directory not found: {static_src}")
logger.warning("Static source directory not found: %s", static_src)

except Exception as e:
logger.warning(f"Failed to copy static assets: {e}")
logger.warning("Failed to copy static assets: %s", e)

# Model Description
if "components" in primary_aibom and primary_aibom["components"]:
description = primary_aibom["components"][0].get("description", "No description available")
if len(description) > 256:
description = description[:253] + "..."
print(f"\n📝 Model Description:\n {description}")
logger.info("Model Description: %s", description)

# License
if "components" in primary_aibom and primary_aibom["components"]:
@@ -173,42 +171,39 @@ def generate(self, model_id: str, output_file: Optional[str] = None, include_inf
if val:
license_list.append(val)
if license_list:
print(f"\n⚖️ License:\n {', '.join(license_list)}")
logger.info("License: %s", ", ".join(license_list))

except Exception as e:
logger.warning(f"Failed to generate HTML report: {e}")
logger.warning("Failed to generate HTML report: %s", e)

# Print Summary for ALL versions
for r in reports:
spec = r.get("spec_version", "1.6")
print(f"\n✅ Successfully generated CycloneDX {spec} SBOM:")
print(f" {r.get('output_file')}")

logger.info("Successfully generated CycloneDX %s SBOM: %s", spec, r.get("output_file"))

if not r["schema_validation"]["valid"]:
print(f"⚠️ Schema Validation Errors ({spec}):")
logger.warning("Schema Validation Errors (%s):", spec)
for err in r["schema_validation"]["errors"]:
print(f" - {err}")
logger.warning(" - %s", err)
else:
print(f" - Schema Validation ({spec}): Valid")
logger.info("Schema Validation (%s): Valid", spec)

# Display Detailed Score Summary (from primary)
if primary_report and "final_score" in primary_report:
score = primary_report["final_score"]
t_score = score.get('total_score', 0)
formatted_t_score = int(t_score) if isinstance(t_score, (int, float)) and t_score == int(t_score) else t_score
print(f"\n📊 Completeness Score: {formatted_t_score}/100")
logger.info("Completeness Score: %s/100", formatted_t_score)

if "completeness_profile" in score:
profile = score["completeness_profile"]
print(f" Profile: {profile.get('name')} - {profile.get('description')}")
logger.info("Profile: %s - %s", profile.get("name"), profile.get("description"))

if "section_scores" in score:
print("\n📋 Section Breakdown:")

logger.info("Section Breakdown:")
for section, s_score in score["section_scores"].items():
max_s = score.get("max_scores", {}).get(section, "?")
formatted_s_score = int(s_score) if isinstance(s_score, (int, float)) and s_score == int(s_score) else s_score
print(f" - {section.replace('_', ' ').title()}: {formatted_s_score}/{max_s}")
logger.info(" %s: %s/%s", section.replace("_", " ").title(), formatted_s_score, max_s)

else:
print("\n❌ Failed to generate any SBOMs.")
logger.error("Failed to generate any SBOMs.")
3 changes: 1 addition & 2 deletions src/main.py
@@ -85,6 +85,5 @@ async def cleanup_middleware(request: Request, call_next):

if __name__ == "__main__":
import uvicorn
# Print clear access URL to avoid 0.0.0.0 confusion
print("🚀 Application ready! Access it at: http://localhost:8000")
logger.info("Application ready! Access it at: http://localhost:8000")
uvicorn.run("src.main:app", host="0.0.0.0", port=8000, reload=True)
10 changes: 5 additions & 5 deletions src/models/__init__.py
@@ -1,10 +1,10 @@
from .schemas import (
DataSource,
ConfidenceLevel,
DataSource,
ConfidenceLevel,
ExtractionResult,
GenerateRequest,
BatchRequest,
AIBOMResponse,
GenerateRequest,
BatchRequest,
AIBOMResponse,
EnhancementReport
)
from .registry import get_field_registry_manager
52 changes: 52 additions & 0 deletions src/models/config_parsing.py
@@ -0,0 +1,52 @@
"""
config.json parsing for HuggingFace model repositories.

Extracts hyperparameters using llama.cpp's find_hparam key fallback chains.
Works for any model format (safetensors, GGUF, pytorch, etc.) — the config.json
schema is format-agnostic.
"""
from typing import Dict, List, Optional, Union

# Exact key fallback order from llama.cpp convert_hf_to_gguf.py
# (see research.md section 12.4 for source line references)
HPARAM_KEYS: Dict[str, List[str]] = {
"block_count": ["n_layers", "num_hidden_layers", "n_layer", "num_layers"],
"context_length": ["max_position_embeddings", "n_ctx", "n_positions",
"max_length", "max_sequence_length", "model_max_length"],
"embedding_length": ["hidden_size", "n_embd", "dim"],
"feed_forward_length": ["intermediate_size", "n_inner", "hidden_dim"],
"attention_head_count": ["num_attention_heads", "n_head", "n_heads"],
"attention_head_count_kv": ["num_key_value_heads", "n_kv_heads"],
"rope_dimension_count": ["rotary_dim", "rope_dim"],
"vocab_size": ["vocab_size"],
"architecture": ["model_type"],
}


ParsedConfig = Dict[str, Optional[Union[str, int]]]


def parse_config(config: dict) -> ParsedConfig:
"""Extract hyperparameters from config.json using llama.cpp's find_hparam key fallback chains.

Handles VLM models that nest text params under text_config (llama.cpp L800-802).

Returns a dict with canonical keys (block_count, embedding_length, etc.)
and None for any fields not found in the config.
"""
# VLM merge: text_config values override root, mirroring llama.cpp
if "text_config" in config:
merged = dict(config)
merged.update(config["text_config"])
config = merged

result: ParsedConfig = {}
for canonical_name, candidate_keys in HPARAM_KEYS.items():
value = None
for key in candidate_keys:
if key in config:
value = config[key]
break
result[canonical_name] = value

return result
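The fallback-chain lookup that the new `parse_config` performs can be sketched in a condensed, self-contained form. This is a reimplementation for illustration only, with a trimmed key table and a made-up VLM-style config; the real module above carries the full `HPARAM_KEYS` mapping.

```python
# Condensed mirror of parse_config: try each candidate key in order,
# after letting a nested text_config (VLM models) override root keys.
HPARAM_KEYS = {
    "block_count": ["n_layers", "num_hidden_layers", "n_layer", "num_layers"],
    "embedding_length": ["hidden_size", "n_embd", "dim"],
    "architecture": ["model_type"],
}

def find_hparam(config: dict, candidates: list):
    for key in candidates:
        if key in config:
            return config[key]
    return None  # field absent from this config.json

config = {
    "model_type": "llama",
    "num_hidden_layers": 32,
    "text_config": {"hidden_size": 4096},  # VLM-style nesting
}
merged = {**config, **config.get("text_config", {})}
hparams = {name: find_hparam(merged, keys) for name, keys in HPARAM_KEYS.items()}
print(hparams)
# {'block_count': 32, 'embedding_length': 4096, 'architecture': 'llama'}
```

Because the lookup reads only config.json, the same extraction works whether the weights ship as safetensors, GGUF, or pytorch checkpoints, which is the format-agnostic claim the module docstring makes.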