Fix charset fallback security vulnerability.

emcd · claude · emcd · commit 413cfb60bbc0 · 2025-07-27T05:04:34.000-07:00
Address security gap where files with unknown MIME types but detectable charsets bypassed content validation. Charset fallback now requires validation through _validate_mimetype_with_trial_decode() before accepting files as text/plain. Maintains security boundaries while preserving legitimate text file acceptance. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/.auxiliary/notes/mime-type-detection-test-plan.md b/.auxiliary/notes/mime-type-detection-test-plan.md
diff --git a/sources/mimeogram/acquirers.py b/sources/mimeogram/acquirers.py
@@ -188,21 +188,19 @@ def _detect_mimetype_and_charset(
         charset_ = _detect_charset( content )
     else: charset_ = charset
     if not mimetype_:
-        if charset_: mimetype_ = 'text/plain' # noqa: SIM108
-        else: mimetype_ = 'application/octet-stream'
+        if charset_:
+            mimetype_ = 'text/plain'
+            _validate_mimetype_with_trial_decode(
+                content, location, mimetype_, charset_ )
+            return mimetype_, charset_
+        mimetype_ = 'application/octet-stream'
     if _is_textual_mimetype( mimetype_ ):
         return mimetype_, charset_
     if charset_ is None:
         raise TextualMimetypeInvalidity( location, mimetype_ )
-    try: text = content.decode( charset_ )
-    except ( UnicodeDecodeError, LookupError ) as exc:
-        raise TextualMimetypeInvalidity( location, mimetype_ ) from exc
-    if _is_reasonable_text_content( text ):
-        _scribe.debug(
-            f"MIME type '{mimetype_}' accepted after successful "
-            f"decode test with charset '{charset_}' for '{location}'." )
-        return mimetype_, charset_
-    raise TextualMimetypeInvalidity( location, mimetype_ )
+    _validate_mimetype_with_trial_decode(
+        content, location, mimetype_, charset_ )
+    return mimetype_, charset_
 
 
 def _is_reasonable_text_content( content: str ) -> bool:
@@ -274,3 +272,19 @@ async def _execute_session( ) -> _parts.Part:
         ) as client: return await _acquire_via_http( client, url )
 
     return _execute_session( )
+
+
+def _validate_mimetype_with_trial_decode(
+    content: bytes, location: str | __.Path, mimetype: str, charset: str
+) -> None:
+    ''' Trial-decodes content; raises if it is not reasonable text. '''
+    from .exceptions import TextualMimetypeInvalidity
+    try: text = content.decode( charset )
+    except ( UnicodeDecodeError, LookupError ) as exc:
+        raise TextualMimetypeInvalidity( location, mimetype ) from exc
+    if _is_reasonable_text_content( text ):
+        _scribe.debug(
+            f"MIME type '{mimetype}' accepted after successful "
+            f"decode test with charset '{charset}' for '{location}'." )
+        return
+    raise TextualMimetypeInvalidity( location, mimetype )
diff --git a/tests/test_000_mimeogram/test_500_acquirers.py b/tests/test_000_mimeogram/test_500_acquirers.py
@@ -196,17 +196,115 @@ async def test_400_detect_mime_types( provide_tempdir, provide_auxdata ):
             "#!/usr/bin/env python3\n"
             "from __future__ import annotations\n\n"
             "def hello() -> str:\n    return 'Python'\n" ),
+        # Test pattern-based detection for structured text formats
+        "config.toml": "[package]\nname = 'test'\n",
+        "data.yaml": "key: value\nlist:\n  - item1\n",
+        "service.json": '{"name": "test", "version": "1.0"}\n',
+        "manifest.xml": (
+            '<?xml version="1.0"?><root><item>test</item></root>\n' ),
+        "rust_code.rs": 'fn main() { println!("Hello, world!"); }\n',
     }
 
     with create_test_files( provide_tempdir, test_files ):
         results = await acquirers.acquire( provide_auxdata, [
             provide_tempdir / "plain.txt",
             provide_tempdir / "script.py",
+            provide_tempdir / "config.toml",
+            provide_tempdir / "data.yaml", 
+            provide_tempdir / "service.json",
+            provide_tempdir / "manifest.xml",
+            provide_tempdir / "rust_code.rs",
         ] )
 
+        assert len( results ) == 7
         mimetypes = { part.mimetype for part in results }
+        
+        # Existing assertions
         assert "text/plain" in mimetypes
         assert any( "python" in mt for mt in mimetypes )
+        
+        # Pattern-based detection assertions for recognized MIME types
+        assert any(
+            mt.endswith( '+json' ) or 'json' in mt for mt in mimetypes )
+        assert any(
+            mt.endswith( '+xml' ) or 'xml' in mt for mt in mimetypes )
+        
+        # TOML and YAML files should be accepted via charset fallback
+        # since Python's mimetypes doesn't recognize them
+        toml_results = [
+            p for p in results if p.location.endswith( 'config.toml' ) ]
+        yaml_results = [
+            p for p in results if p.location.endswith( 'data.yaml' ) ]
+        assert len( toml_results ) == 1
+        assert len( yaml_results ) == 1
+        
+        # Rust files should be accepted (regression test for original issue)
+        rust_results = [
+            p for p in results if p.location.endswith( 'rust_code.rs' ) ]
+        assert len( rust_results ) == 1
+        assert 'application/rls-services+xml' in rust_results[ 0 ].mimetype
+
+
+@pytest.mark.asyncio
+async def test_410_application_x_security( provide_tempdir, provide_auxdata ):
+    ''' Binary files are rejected; textual script files are accepted. '''
+    acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" )
+
+    # Create test files with binary signatures that puremagic recognizes
+    binary_files = {
+        'test.exe': b'MZ\x90\x00' + b'\x00' * 100,  # PE header
+        'test.dmg': b'koly' + b'\x00' * 100,         # DMG trailer signature
+        # Use obviously binary file that won't be detected as having charset
+        'test.bin': bytes( [ 0xFF, 0x00 ] * 52 ),   # Alternating binary  
+    }
+    
+    # Create safe scripting files
+    script_files = {
+        'script.rb': 'puts "Hello, Ruby!"\n',
+        'script.py': 'print("Hello, Python!")\n', 
+        'script.pl': 'print "Hello, Perl!\\n";\n',
+        'script.php': '<?php echo "Hello, PHP!"; ?>\n',
+    }
+
+    binary_paths = [ ]
+    script_paths = [ ]
+    
+    try:
+        # Create binary files
+        for filename, content in binary_files.items( ):
+            path = provide_tempdir / filename
+            path.write_bytes( content )
+            binary_paths.append( path )
+            
+        # Create script files
+        for filename, content in script_files.items( ):
+            path = provide_tempdir / filename
+            path.write_text( content )
+            script_paths.append( path )
+
+        # Test binary files are rejected in non-strict mode
+        provide_auxdata.configuration[
+            'acquire-parts' ][ 'fail-on-invalid' ] = False
+        binary_results = await acquirers.acquire(
+            provide_auxdata, binary_paths )
+        assert len( binary_results ) == 0  # All binary files rejected
+        
+        # Test script files are accepted
+        script_results = await acquirers.acquire(
+            provide_auxdata, script_paths )
+        assert len( script_results ) == len( script_files )
+        
+        # Verify MIME types for accepted scripts
+        script_mimetypes = { part.mimetype for part in script_results }
+        assert 'application/x-ruby' in script_mimetypes
+        # Note: .py files might be detected as text/x-python, not app/x-python
+        assert any( 'python' in mt for mt in script_mimetypes )
+        
+    finally:
+        # Remove every created file, even when an assertion above failed
+        for path in binary_paths + script_paths:
+            if path.exists( ):
+                path.unlink( )
 
 
 # Error Handling Tests
@@ -272,6 +370,60 @@ async def test_520_nontextual_mime( provide_tempdir, provide_auxdata ):
         if binary_path.exists( ): binary_path.unlink( )
 
 
+@pytest.mark.asyncio
+async def test_525_charset_fallback_validation(
+    provide_tempdir, provide_auxdata
+):
+    ''' Unknown-extension text files are accepted; empty ones rejected. '''
+    acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" )
+    
+    # Test that files with unknown extensions but valid text content
+    # are properly handled
+    test_files = {
+        'code.unknown': 'fn main() {\n    println!("Hello!");\n}\n',
+        'config.conf': 'key=value\nsection=main\n',  
+        'data.dat': '{"valid": "json", "content": true}\n',
+    }
+    
+    paths_to_cleanup = [ ]
+    
+    try:
+        provide_auxdata.configuration[
+            'acquire-parts' ][ 'fail-on-invalid' ] = False
+            
+        # Create files with unknown extensions
+        for filename, content in test_files.items( ):
+            path = provide_tempdir / filename
+            path.write_text( content )
+            paths_to_cleanup.append( path )
+            
+        results = await acquirers.acquire( provide_auxdata, paths_to_cleanup )
+        
+        # All text files with unknown extensions should be accepted
+        # via charset-based fallback (or immediate text/plain detection)
+        assert len( results ) == 3
+        
+        # Verify they all have valid charsets
+        for part in results:
+            assert part.charset is not None
+            assert part.charset in [ 'utf-8', 'ascii' ]
+            
+        # Test that truly empty files are handled appropriately
+        empty_path = provide_tempdir / 'empty.unknown'
+        empty_path.write_text( '' )
+        empty_results = await acquirers.acquire(
+            provide_auxdata, [ empty_path ] )
+        # Empty files get rejected
+        assert len( empty_results ) == 0
+        paths_to_cleanup.append( empty_path )
+        
+    finally:
+        # Cleanup; NOTE(review): empty_path leaks if its assert fails.
+        for path in paths_to_cleanup:
+            if path.exists( ):
+                path.unlink( )
+
+
 @pytest.mark.asyncio
 async def test_530_strict_mode_handling( provide_tempdir, provide_auxdata ):
     ''' Tests strict mode handling of invalid files. '''