Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions tests/memory_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -17,13 +16,14 @@
#
# To run:
# python tika/tests/memory_benchmark.py
import gzip
import os
import zlib
import gzip

import tika.parser
from memory_profiler import profile

import tika.parser


@profile
def test_parser_binary():
Expand Down
4 changes: 1 addition & 3 deletions tests/test_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -16,11 +15,10 @@
# limitations under the License.
#
# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py
# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py
import gzip
import os
import unittest
import zlib
import gzip
from http import HTTPStatus

import tika.parser
Expand Down
8 changes: 2 additions & 6 deletions tests/test_from_file_service.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -17,12 +16,9 @@
#
# python -m unittest tika.tests.test_from_file_service

import sys
import unittest
if sys.version_info >= (3, 3):
from unittest import mock
else:
import mock
from unittest import mock

import tika.parser


Expand Down
1 change: 0 additions & 1 deletion tests/test_tika.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand Down
14 changes: 8 additions & 6 deletions tests/tests_params.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -19,14 +19,16 @@
#https://docs.python.org/2/library/unittest.html
#http://eli.thegreenplace.net/2011/08/02/python-unit-testing-parametrized-test-cases
#public domain license reference: http://eli.thegreenplace.net/pages/code

#Run
#python tika/tests/tests_params.py

import csv
import unittest

import tika.parser


class CreateTest(unittest.TestCase):
"test for file types"
def __init__(self, methodName='runTest', param1=None, param2=None):
Expand Down Expand Up @@ -64,17 +66,17 @@ def test_suite():
try:
suite.addTest(CreateTest.parameterize(RemoteTest,param1=x))
except IOError as e:
print(e.strerror)
return suite
print(e.strerror)
return suite

def test_url():
with open('tika/tests/arguments/test_remote_content.csv', 'r') as csvfile:
urlread = csv.reader(csvfile)
for url in urlread:
yield url[1]



if __name__ == '__main__':
suite = test_suite()
unittest.TextTestRunner(verbosity=2).run(suite)
unittest.TextTestRunner(verbosity=2).run(suite)
3 changes: 1 addition & 2 deletions tests/tests_unpack.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# coding=utf8

import unittest
from tempfile import NamedTemporaryFile

from tika import unpack


Expand Down
2 changes: 1 addition & 1 deletion tika/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -17,6 +16,7 @@
__version__ = "3.1.0"

from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)

def initVM():
Expand Down
5 changes: 3 additions & 2 deletions tika/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -14,10 +14,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#

from .tika import getConfig


def getParsers():
return getConfig('parsers')[1]

Expand Down
6 changes: 3 additions & 3 deletions tika/detector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -14,9 +13,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#

from .tika import ServerEndpoint, callServer, detectType1

from .tika import detectType1, callServer, ServerEndpoint

def from_file(filename, config_path=None, requestOptions={}):
'''
Expand Down
7 changes: 4 additions & 3 deletions tika/language.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -14,9 +14,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#

from .tika import ServerEndpoint, callServer, detectLang1

from .tika import detectLang1, callServer, ServerEndpoint

def from_file(filename, requestOptions={}):
'''
Expand Down
6 changes: 4 additions & 2 deletions tika/parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -16,9 +16,11 @@
# limitations under the License.
#

from .tika import parse1, callServer, ServerEndpoint
import json

from .tika import ServerEndpoint, callServer, parse1


def from_file(filename, serverEndpoint=ServerEndpoint, service='all', xmlContent=False, headers=None, config_path=None, requestOptions={}, raw_response=False):
'''
Parses a file for metadata and content
Expand Down
7 changes: 5 additions & 2 deletions tika/pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -16,10 +16,13 @@
# limitations under the License.
#

from tika import parser
from io import StringIO

from bs4 import BeautifulSoup

from tika import parser


def text_from_pdf_pages(filename):
pages_txt = []

Expand Down
Loading
Loading