From 3fe18abada596fa9899d2f93298009be6985bf2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:35:03 +0100 Subject: [PATCH 1/2] test: convert unittest style tests to pytest test functions --- tests/test_benchmark.py | 73 +++++++++++++++----------------------- tests/test_tika.py | 77 ++++++++++++++++++++--------------------- tests/tests_unpack.py | 77 ++++++++++++++++++++--------------------- 3 files changed, 103 insertions(+), 124 deletions(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 88ddc5fd..2fc3b6ec 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -15,10 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py -# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py -import os -import unittest +# pytest --benchmark-enable --benchmark-timer=time.process_time tests/test_benchmark.py + +from pathlib import Path import zlib import gzip from http import HTTPStatus @@ -26,91 +25,73 @@ import tika.parser +TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf" +HEADERS = {"Accept-Encoding": "gzip, deflate"} + + def test_local_binary(benchmark): """parse file binary""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - response = benchmark(tika_from_binary, file) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_binary, TEST_FILE_PATH) + assert response["status"] == HTTPStatus.OK def test_parser_buffer(benchmark): """example how to send gzip file""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - response = benchmark(tika_from_buffer, file) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_buffer, TEST_FILE_PATH) + assert response["status"] == HTTPStatus.OK def test_parser_buffer_zlib_input(benchmark): """example how to send gzip file""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - - response = benchmark(tika_from_buffer_zlib, file) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_buffer_zlib, TEST_FILE_PATH) + assert response["status"] == HTTPStatus.OK def test_parser_buffer_gzip_input(benchmark): """parse file binary""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - response = benchmark(tika_from_buffer_gzip, file) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_buffer_gzip, TEST_FILE_PATH) + assert response["status"] == HTTPStatus.OK def test_local_binary_with_gzip_output(benchmark): """parse file binary""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - response = benchmark(tika_from_binary, file, headers={'Accept-Encoding': 'gzip, deflate'}) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_binary, TEST_FILE_PATH, headers=HEADERS) + assert response["status"] == HTTPStatus.OK def test_parser_buffer_with_gzip_output(benchmark): """example how to send gzip file""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - response = benchmark(tika_from_buffer, file, headers={'Accept-Encoding': 'gzip, deflate'}) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_buffer, TEST_FILE_PATH, headers=HEADERS) + assert response["status"] == HTTPStatus.OK def test_parser_buffer_zlib_input_and_gzip_output(benchmark): """example how to send gzip file""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - - response = benchmark(tika_from_buffer_zlib, file, headers={'Accept-Encoding': 'gzip, deflate'}) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_buffer_zlib, TEST_FILE_PATH, headers=HEADERS) + assert response["status"] == HTTPStatus.OK def test_parser_buffer_gzip_input_and_gzip_output(benchmark): """parse file binary""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - response = benchmark(tika_from_buffer_gzip, file, headers={'Accept-Encoding': 'gzip, deflate'}) - - assert response['status'] == HTTPStatus.OK + response = benchmark(tika_from_buffer_gzip, TEST_FILE_PATH, headers=HEADERS) + assert response["status"] == HTTPStatus.OK def tika_from_buffer_zlib(file, headers=None): - with open(file, 'rb') as file_obj: + with open(file, "rb") as file_obj: return tika.parser.from_buffer(zlib.compress(file_obj.read()), headers=headers) def tika_from_buffer_gzip(file, headers=None): - with open(file, 'rb') as file_obj: + with open(file, "rb") as file_obj: return tika.parser.from_buffer(gzip.compress(file_obj.read()), headers=headers) def tika_from_buffer(file, headers=None): - with open(file, 'rb') as file_obj: + with open(file, "rb") as file_obj: return tika.parser.from_buffer(file_obj.read(), headers=headers) def tika_from_binary(file, headers=None): - with open(file, 'rb') as file_obj: + with open(file, "rb") as file_obj: return tika.parser.from_file(file_obj, headers=headers) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/test_tika.py b/tests/test_tika.py index c61cb812..98fa80ae 100644 --- a/tests/test_tika.py +++ b/tests/test_tika.py @@ -15,58 +15,57 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import unittest +from pathlib import Path from http import HTTPStatus import tika.parser import tika.tika -class CreateTest(unittest.TestCase): - """test for file types""" +TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf" - def test_remote_pdf(self): - """parse remote PDF""" - self.assertTrue(tika.parser.from_file( - 'https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf')) - def test_remote_html(self): - """parse remote HTML""" - self.assertTrue(tika.parser.from_file('http://neverssl.com/index.html')) +def test_remote_pdf(): + """parse remote PDF""" + assert tika.parser.from_file( + "https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf") - def test_remote_mp3(self): - """parse remote mp3""" - self.assertTrue(tika.parser.from_file( - 'https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3')) - def test_remote_jpg(self): - """parse remote jpg""" - self.assertTrue(tika.parser.from_file( - 'https://upload.wikimedia.org/wikipedia/commons/b/b7/X_logo.jpg')) +def test_remote_html(): + """parse remote HTML""" + assert tika.parser.from_file("http://neverssl.com/index.html") - def test_local_binary(self): - """parse file binary""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - with open(file, 'rb') as file_obj: - self.assertTrue(tika.parser.from_file(file_obj)) - def test_local_buffer(self): - response = tika.parser.from_buffer('Good evening, Dave') - self.assertEqual(response['status'], HTTPStatus.OK) +def test_remote_mp3(): + """parse remote mp3""" + assert tika.parser.from_file( + "https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3") - def test_local_path(self): - """parse file path""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - self.assertTrue(tika.parser.from_file(file)) - def test_kill_server(self): - """parse some file then kills server""" - file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf') - with open(file, 'rb') as file_obj: - tika.parser.from_file(file_obj) - self.assertIsNone(tika.tika.killServer()) +def test_remote_jpg(): + """parse remote jpg""" + assert tika.parser.from_file( + "https://upload.wikimedia.org/wikipedia/commons/b/b7/X_logo.jpg") -if __name__ == '__main__': - unittest.main() +def test_local_binary(): + """parse file binary""" + with open(TEST_FILE_PATH, "rb") as file_obj: + assert tika.parser.from_file(file_obj) + + +def test_local_buffer(): + response = tika.parser.from_buffer("Good evening, Dave") + assert response["status"] == HTTPStatus.OK + + +def test_local_path(): + """parse file path""" + assert tika.parser.from_file(str(TEST_FILE_PATH)) + + +def test_kill_server(): + """parse some file then kills server""" + with open(TEST_FILE_PATH, "rb") as file_obj: + tika.parser.from_file(file_obj) + assert tika.tika.killServer() is None diff --git a/tests/tests_unpack.py b/tests/tests_unpack.py index 38a6b415..cc19f88f 100644 --- a/tests/tests_unpack.py +++ b/tests/tests_unpack.py @@ -1,43 +1,42 @@ -# coding=utf8 - -import unittest from tempfile import NamedTemporaryFile + from tika import unpack -class CreateTest(unittest.TestCase): - "Test different encodings" - text_utf8 = u"Hello, world!! 😎 👽" - text_ascii = u"Hello, world!!" - - def test_utf8(self): - with NamedTemporaryFile("w+b", prefix='tika-python', suffix='.txt', dir='/tmp') as f: - f.write(self.text_utf8.encode("utf8")) - f.flush() - f.seek(0) - parsed = unpack.from_file(f.name) - self.assertEqual(parsed["content"].strip(), self.text_utf8) - - def test_ascii(self): - with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt', dir='/tmp') as f: - f.write(self.text_ascii) - f.flush() - f.seek(0) - parsed = unpack.from_file(f.name) - self.assertEqual(parsed["content"].strip(), self.text_ascii) - - def test_from_buffer(self): - parsed = unpack.from_buffer('what?') - self.assertIsNotNone(parsed) - self.assertIsNotNone(parsed["metadata"]) - self.assertEqual(parsed["metadata"]["Content-Length"], "5") - - def test_from_buffer_with_headers(self): - parsed = unpack.from_buffer('what?', headers={'Param': 'whatever'}) - self.assertIsNotNone(parsed) - self.assertIsNotNone(parsed["metadata"]) - self.assertEqual(parsed["metadata"]["Content-Length"], "5") - - -if __name__ == '__main__': - unittest.main() +# Test data +TEXT_UTF8 = "Hello, world!! 😎 👽" +TEXT_ASCII = "Hello, world!!" + + +def test_utf8(): + """Test UTF-8 encoding""" + with NamedTemporaryFile("w+b", prefix="tika-python", suffix=".txt", dir="/tmp") as f: + f.write(TEXT_UTF8.encode("utf8")) + f.flush() + f.seek(0) + parsed = unpack.from_file(f.name) + assert parsed["content"].strip() == TEXT_UTF8 + + +def test_ascii(): + """Test ASCII encoding""" + with NamedTemporaryFile("w+t", prefix="tika-python", suffix=".txt", dir="/tmp") as f: + f.write(TEXT_ASCII) + f.flush() + f.seek(0) + parsed = unpack.from_file(f.name) + assert parsed["content"].strip() == TEXT_ASCII + + +def test_from_buffer(): + parsed = unpack.from_buffer("what?") + assert parsed is not None + assert parsed["metadata"] is not None + assert parsed["metadata"]["Content-Length"] == "5" + + +def test_from_buffer_with_headers(): + parsed = unpack.from_buffer("what?", headers={"Param": "whatever"}) + assert parsed is not None + assert parsed["metadata"] is not None + assert parsed["metadata"]["Content-Length"] == "5" From 43af386c4237a32c957627aa25073424430087e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Tue, 10 Mar 2026 18:21:14 +0100 Subject: [PATCH 2/2] test: rename test module so pytest picks it up --- tests/{tests_unpack.py => test_unpack.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{tests_unpack.py => test_unpack.py} (100%) diff --git a/tests/tests_unpack.py b/tests/test_unpack.py similarity index 100% rename from tests/tests_unpack.py rename to tests/test_unpack.py