@@ -196,17 +196,115 @@ async def test_400_detect_mime_types( provide_tempdir, provide_auxdata ):
196196 "#!/usr/bin/env python3\n "
197197 "from __future__ import annotations\n \n "
198198 "def hello() -> str:\n return 'Python'\n " ),
199+ # Test pattern-based detection for structured text formats
200+ "config.toml" : "[package]\n name = 'test'\n " ,
201+ "data.yaml" : "key: value\n list:\n - item1\n " ,
202+ "service.json" : '{"name": "test", "version": "1.0"}\n ' ,
203+ "manifest.xml" : (
204+ '<?xml version="1.0"?><root><item>test</item></root>\n ' ),
205+ "rust_code.rs" : 'fn main() { println!("Hello, world!"); }\n ' ,
199206 }
200207
201208 with create_test_files ( provide_tempdir , test_files ):
202209 results = await acquirers .acquire ( provide_auxdata , [
203210 provide_tempdir / "plain.txt" ,
204211 provide_tempdir / "script.py" ,
212+ provide_tempdir / "config.toml" ,
213+ provide_tempdir / "data.yaml" ,
214+ provide_tempdir / "service.json" ,
215+ provide_tempdir / "manifest.xml" ,
216+ provide_tempdir / "rust_code.rs" ,
205217 ] )
206218
219+ assert len ( results ) == 7
207220 mimetypes = { part .mimetype for part in results }
221+
222+ # Existing assertions
208223 assert "text/plain" in mimetypes
209224 assert any ( "python" in mt for mt in mimetypes )
225+
226+ # Pattern-based detection assertions for recognized MIME types
227+ assert any (
228+ mt .endswith ( '+json' ) or 'json' in mt for mt in mimetypes )
229+ assert any (
230+ mt .endswith ( '+xml' ) or 'xml' in mt for mt in mimetypes )
231+
232+ # TOML and YAML files should be accepted via charset fallback
233+ # since Python's mimetypes doesn't recognize them
234+ toml_results = [
235+ p for p in results if p .location .endswith ( 'config.toml' ) ]
236+ yaml_results = [
237+ p for p in results if p .location .endswith ( 'data.yaml' ) ]
238+ assert len ( toml_results ) == 1
239+ assert len ( yaml_results ) == 1
240+
241+ # Rust files should be accepted (regression test for original issue)
242+ rust_results = [
243+ p for p in results if p .location .endswith ( 'rust_code.rs' ) ]
244+ assert len ( rust_results ) == 1
245+ assert 'application/rls-services+xml' in rust_results [ 0 ].mimetype
246+
247+
@pytest.mark.asyncio
async def test_410_application_x_security( provide_tempdir, provide_auxdata ):
    ''' Binary application/x- payloads are refused; script sources pass. '''
    acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" )

    # Payloads carrying binary signatures which, per the original author's
    # note, puremagic recognizes.
    binary_files = {
        'test.exe': b'MZ\x90\x00' + b'\x00' * 100,  # PE header
        'test.dmg': b'koly' + b'\x00' * 100,  # DMG trailer signature
        # Alternating bytes: obviously binary, so that no charset is
        # expected to be detected for it.
        'test.bin': bytes( [ 0xFF, 0x00 ] * 52 ),
    }

    # Harmless scripting-language sources.
    script_files = {
        'script.rb': 'puts "Hello, Ruby!"\n',
        'script.py': 'print("Hello, Python!")\n',
        'script.pl': 'print "Hello, Perl!\\n";\n',
        'script.php': '<?php echo "Hello, PHP!"; ?>\n',
    }

    binary_paths = [ ]
    script_paths = [ ]

    try:
        # Materialize the binary payloads first, then the scripts.
        for name, payload in binary_files.items( ):
            target = provide_tempdir / name
            target.write_bytes( payload )
            binary_paths.append( target )
        for name, text in script_files.items( ):
            target = provide_tempdir / name
            target.write_text( text )
            script_paths.append( target )

        # Non-strict mode: invalid files are expected to be dropped from
        # the results rather than to raise.
        parts_configuration = provide_auxdata.configuration[ 'acquire-parts' ]
        parts_configuration[ 'fail-on-invalid' ] = False

        # Every binary payload must be rejected.
        binary_results = await acquirers.acquire(
            provide_auxdata, binary_paths )
        assert len( binary_results ) == 0

        # Every script must be accepted.
        script_results = await acquirers.acquire(
            provide_auxdata, script_paths )
        assert len( script_results ) == len( script_files )

        # Spot-check MIME types reported for the accepted scripts.
        detected = { result.mimetype for result in script_results }
        assert 'application/x-ruby' in detected
        # .py may be reported as text/x-python rather than
        # application/x-python, hence the substring match.
        assert any( 'python' in mimetype for mimetype in detected )

    finally:
        # Remove whatever was actually created.
        for leftover in ( *binary_paths, *script_paths ):
            if leftover.exists( ):
                leftover.unlink( )
210308
211309
212310# Error Handling Tests
@@ -272,6 +370,60 @@ async def test_520_nontextual_mime( provide_tempdir, provide_auxdata ):
272370 if binary_path .exists ( ): binary_path .unlink ( )
273371
274372
@pytest.mark.asyncio
async def test_525_charset_fallback_validation(
    provide_tempdir, provide_auxdata
):
    ''' Enhanced MIME type detection accepts valid structured text files.

        Text files with unknown extensions are expected to be acquired
        with a UTF-8 or ASCII charset, while an empty file is expected to
        be rejected when ``fail-on-invalid`` is disabled.
    '''
    acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" )

    # Files with unknown extensions but valid text content.
    test_files = {
        'code.unknown': 'fn main() {\nprintln!("Hello!");\n}\n',
        'config.conf': 'key=value\nsection=main\n',
        'data.dat': '{"valid": "json", "content": true}\n',
    }

    paths_to_cleanup = [ ]

    try:
        # Non-strict mode: invalid files are expected to be dropped from
        # the results rather than to raise.
        provide_auxdata.configuration[
            'acquire-parts' ][ 'fail-on-invalid' ] = False

        # Create files with unknown extensions. Each path is registered
        # for cleanup before it is written, so that a failure at any later
        # point cannot leave the file behind.
        text_paths = [ ]
        for filename, content in test_files.items( ):
            path = provide_tempdir / filename
            paths_to_cleanup.append( path )
            path.write_text( content )
            text_paths.append( path )

        results = await acquirers.acquire( provide_auxdata, text_paths )

        # All text files with unknown extensions should be accepted
        # via charset-based fallback (or immediate text/plain detection).
        assert len( results ) == len( test_files )

        # Verify they all have valid charsets.
        for part in results:
            assert part.charset is not None
            assert part.charset in ( 'utf-8', 'ascii' )

        # Truly empty files should be rejected. The path is registered for
        # cleanup before acquisition so that a failing assertion does not
        # leak the file.
        empty_path = provide_tempdir / 'empty.unknown'
        paths_to_cleanup.append( empty_path )
        empty_path.write_text( '' )
        empty_results = await acquirers.acquire(
            provide_auxdata, [ empty_path ] )
        assert len( empty_results ) == 0

    finally:
        # Cleanup
        for path in paths_to_cleanup:
            if path.exists( ):
                path.unlink( )
425+
426+
275427@pytest .mark .asyncio
276428async def test_530_strict_mode_handling ( provide_tempdir , provide_auxdata ):
277429 ''' Tests strict mode handling of invalid files. '''
0 commit comments