Skip to content

Commit 413cfb6

Browse files
emcd and claude committed
Fix charset fallback security vulnerability.
Address security gap where files with unknown MIME types but detectable charsets bypassed content validation. Charset fallback now requires validation through _validate_mimetype_with_trial_decode() before accepting files as text/plain. Maintains security boundaries while preserving legitimate text file acceptance. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent d4c694c commit 413cfb6

3 files changed

Lines changed: 177 additions & 114 deletions

File tree

.auxiliary/notes/mime-type-detection-test-plan.md

Lines changed: 0 additions & 103 deletions
This file was deleted.

sources/mimeogram/acquirers.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -188,21 +188,19 @@ def _detect_mimetype_and_charset(
188188
charset_ = _detect_charset( content )
189189
else: charset_ = charset
190190
if not mimetype_:
191-
if charset_: mimetype_ = 'text/plain' # noqa: SIM108
192-
else: mimetype_ = 'application/octet-stream'
191+
if charset_:
192+
mimetype_ = 'text/plain'
193+
_validate_mimetype_with_trial_decode(
194+
content, location, mimetype_, charset_ )
195+
return mimetype_, charset_
196+
mimetype_ = 'application/octet-stream'
193197
if _is_textual_mimetype( mimetype_ ):
194198
return mimetype_, charset_
195199
if charset_ is None:
196200
raise TextualMimetypeInvalidity( location, mimetype_ )
197-
try: text = content.decode( charset_ )
198-
except ( UnicodeDecodeError, LookupError ) as exc:
199-
raise TextualMimetypeInvalidity( location, mimetype_ ) from exc
200-
if _is_reasonable_text_content( text ):
201-
_scribe.debug(
202-
f"MIME type '{mimetype_}' accepted after successful "
203-
f"decode test with charset '{charset_}' for '{location}'." )
204-
return mimetype_, charset_
205-
raise TextualMimetypeInvalidity( location, mimetype_ )
201+
_validate_mimetype_with_trial_decode(
202+
content, location, mimetype_, charset_ )
203+
return mimetype_, charset_
206204

207205

208206
def _is_reasonable_text_content( content: str ) -> bool:
@@ -274,3 +272,19 @@ async def _execute_session( ) -> _parts.Part:
274272
) as client: return await _acquire_via_http( client, url )
275273

276274
return _execute_session( )
275+
276+
277+
def _validate_mimetype_with_trial_decode(
    content: bytes, location: str | __.Path, mimetype: str, charset: str
) -> None:
    ''' Confirms that content decodes to reasonable text under charset.

        Returns nothing on success; acceptance is only logged at debug
        level. Raises ``TextualMimetypeInvalidity`` for the location and
        MIME type if the content cannot be decoded with the charset
        (undecodable bytes or unknown codec name) or if the decoded text
        is rejected by ``_is_reasonable_text_content``.
    '''
    from .exceptions import TextualMimetypeInvalidity
    try: decoded = content.decode( charset )
    except ( UnicodeDecodeError, LookupError ) as exc:
        raise TextualMimetypeInvalidity( location, mimetype ) from exc
    # Guard clause: a successful decode alone is not sufficient evidence.
    if not _is_reasonable_text_content( decoded ):
        raise TextualMimetypeInvalidity( location, mimetype )
    _scribe.debug(
        f"MIME type '{mimetype}' accepted after successful "
        f"decode test with charset '{charset}' for '{location}'." )

tests/test_000_mimeogram/test_500_acquirers.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,17 +196,115 @@ async def test_400_detect_mime_types( provide_tempdir, provide_auxdata ):
196196
"#!/usr/bin/env python3\n"
197197
"from __future__ import annotations\n\n"
198198
"def hello() -> str:\n return 'Python'\n" ),
199+
# Test pattern-based detection for structured text formats
200+
"config.toml": "[package]\nname = 'test'\n",
201+
"data.yaml": "key: value\nlist:\n - item1\n",
202+
"service.json": '{"name": "test", "version": "1.0"}\n',
203+
"manifest.xml": (
204+
'<?xml version="1.0"?><root><item>test</item></root>\n' ),
205+
"rust_code.rs": 'fn main() { println!("Hello, world!"); }\n',
199206
}
200207

201208
with create_test_files( provide_tempdir, test_files ):
202209
results = await acquirers.acquire( provide_auxdata, [
203210
provide_tempdir / "plain.txt",
204211
provide_tempdir / "script.py",
212+
provide_tempdir / "config.toml",
213+
provide_tempdir / "data.yaml",
214+
provide_tempdir / "service.json",
215+
provide_tempdir / "manifest.xml",
216+
provide_tempdir / "rust_code.rs",
205217
] )
206218

219+
assert len( results ) == 7
207220
mimetypes = { part.mimetype for part in results }
221+
222+
# Existing assertions
208223
assert "text/plain" in mimetypes
209224
assert any( "python" in mt for mt in mimetypes )
225+
226+
# Pattern-based detection assertions for recognized MIME types
227+
assert any(
228+
mt.endswith( '+json' ) or 'json' in mt for mt in mimetypes )
229+
assert any(
230+
mt.endswith( '+xml' ) or 'xml' in mt for mt in mimetypes )
231+
232+
# TOML and YAML files should be accepted via charset fallback
233+
# since Python's mimetypes doesn't recognize them
234+
toml_results = [
235+
p for p in results if p.location.endswith( 'config.toml' ) ]
236+
yaml_results = [
237+
p for p in results if p.location.endswith( 'data.yaml' ) ]
238+
assert len( toml_results ) == 1
239+
assert len( yaml_results ) == 1
240+
241+
# Rust files should be accepted (regression test for original issue)
242+
rust_results = [
243+
p for p in results if p.location.endswith( 'rust_code.rs' ) ]
244+
assert len( rust_results ) == 1
245+
assert 'application/rls-services+xml' in rust_results[ 0 ].mimetype
246+
247+
248+
@pytest.mark.asyncio
async def test_410_application_x_security( provide_tempdir, provide_auxdata ):
    ''' Binary payloads are rejected; script sources are accepted. '''
    acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" )

    # Binary fixtures, keyed by file name. The first two begin with
    # well-known magic bytes; the third is a byte pattern chosen so that
    # no charset should be detected for it.
    binary_payloads = {
        'test.exe': b'MZ\x90\x00' + b'\x00' * 100,  # 'MZ' executable header
        'test.dmg': b'koly' + b'\x00' * 100,  # 'koly' DMG signature
        'test.bin': bytes( [ 0xFF, 0x00 ] * 52 ),  # alternating 0xFF/0x00
    }

    # Harmless one-line scripts in several scripting languages.
    script_sources = {
        'script.rb': 'puts "Hello, Ruby!"\n',
        'script.py': 'print("Hello, Python!")\n',
        'script.pl': 'print "Hello, Perl!\\n";\n',
        'script.php': '<?php echo "Hello, PHP!"; ?>\n',
    }

    binaries = [ ]
    scripts = [ ]

    try:
        # Materialize fixtures, remembering each path for cleanup.
        for name, payload in binary_payloads.items( ):
            target = provide_tempdir / name
            target.write_bytes( payload )
            binaries.append( target )
        for name, source in script_sources.items( ):
            target = provide_tempdir / name
            target.write_text( source )
            scripts.append( target )

        # With 'fail-on-invalid' disabled, the assertion below expects
        # the binary files to be absent from the results.
        options = provide_auxdata.configuration[ 'acquire-parts' ]
        options[ 'fail-on-invalid' ] = False
        rejected = await acquirers.acquire( provide_auxdata, binaries )
        assert len( rejected ) == 0

        # Every script must come back as a part.
        accepted = await acquirers.acquire( provide_auxdata, scripts )
        assert len( accepted ) == len( script_sources )

        # Spot-check the MIME types reported for the scripts. The Python
        # check is deliberately loose because the exact type may be
        # text/x-python rather than application/x-python.
        mimetypes = { part.mimetype for part in accepted }
        assert 'application/x-ruby' in mimetypes
        assert any( 'python' in mimetype for mimetype in mimetypes )

    finally:
        # Remove whatever fixtures were created.
        for leftover in binaries + scripts:
            if leftover.exists( ):
                leftover.unlink( )
210308

211309

212310
# Error Handling Tests
@@ -272,6 +370,60 @@ async def test_520_nontextual_mime( provide_tempdir, provide_auxdata ):
272370
if binary_path.exists( ): binary_path.unlink( )
273371

274372

373+
@pytest.mark.asyncio
async def test_525_charset_fallback_validation(
    provide_tempdir, provide_auxdata
):
    ''' Charset fallback accepts valid text and rejects empty files.

        Files with uninformative extensions but ordinary text content
        must be acquired with a detected charset, while an empty file
        with an unknown extension must be dropped in non-strict mode.
    '''
    acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" )

    # Text content behind extensions which do not identify the format.
    test_files = {
        'code.unknown': 'fn main() {\n    println!("Hello!");\n}\n',
        'config.conf': 'key=value\nsection=main\n',
        'data.dat': '{"valid": "json", "content": true}\n',
    }

    text_paths = [ ]
    paths_to_cleanup = [ ]

    try:
        provide_auxdata.configuration[
            'acquire-parts' ][ 'fail-on-invalid' ] = False

        for filename, content in test_files.items( ):
            path = provide_tempdir / filename
            # Register before writing so that cleanup covers the file
            # even if a later step raises.
            paths_to_cleanup.append( path )
            path.write_text( content )
            text_paths.append( path )

        results = await acquirers.acquire( provide_auxdata, text_paths )

        # All text files with unknown extensions should be accepted
        # via charset-based fallback (or immediate text/plain detection).
        assert len( results ) == len( test_files )

        # Verify they all have valid charsets.
        for part in results:
            assert part.charset is not None
            assert part.charset in [ 'utf-8', 'ascii' ]

        # Empty file: registered for cleanup *before* acquisition and
        # assertion, so that a failure here cannot leak the file into
        # the temporary directory.
        empty_path = provide_tempdir / 'empty.unknown'
        paths_to_cleanup.append( empty_path )
        empty_path.write_text( '' )
        empty_results = await acquirers.acquire(
            provide_auxdata, [ empty_path ] )
        # Empty files get rejected.
        assert len( empty_results ) == 0

    finally:
        # Cleanup.
        for path in paths_to_cleanup:
            if path.exists( ):
                path.unlink( )
425+
426+
275427
@pytest.mark.asyncio
276428
async def test_530_strict_mode_handling( provide_tempdir, provide_auxdata ):
277429
''' Tests strict mode handling of invalid files. '''

0 commit comments

Comments
 (0)