-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_assembly_transcript.sh
More file actions
executable file
·48 lines (39 loc) · 1.04 KB
/
convert_assembly_transcript.sh
File metadata and controls
executable file
·48 lines (39 loc) · 1.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/sh
#
# Project: shell-scripts
# Author: Ken Zalewski
# Organization: New York State Senate
# Date: 2020-09-02
# Revised: 2020-09-25 - eliminate non-printables in text using -enc option
#
# Script name with any leading directory removed; used as the prefix of
# every diagnostic message. Parameter expansion is used instead of an
# unquoted `basename $0` so that a script path containing spaces (or a
# leading dash) is handled correctly.
prog=${0##*/}
# Write a one-line usage synopsis to stderr.
# Reads the global $prog for the script name; produces nothing on stdout.
usage() {
  printf 'Usage: %s pdf_file [pdf_file ...]\n' "$prog" 1>&2
}
# At least one PDF path is required; with none, report the error, show the
# usage synopsis, and stop with status 1.
case $# in
  0)
    echo "$prog: At least one PDF file must be specified" >&2
    usage
    exit 1
    ;;
esac
# Convert each PDF named on the command line to a text file, then scan the
# resulting text for characters outside the accepted set.
#
# For an input "dir/name.pdf" the output is "name.txt", written relative to
# the current working directory. The script's exit status (rc) is 0 only if
# every input was readable, was identified as a PDF, converted successfully,
# and produced text with no flagged characters; otherwise it is 1.
rc=0
for pdf_file in "$@"; do
  if [ ! -r "$pdf_file" ]; then
    echo "$prog: $pdf_file: File not found; skipping" >&2
    rc=1
    continue
  # -b drops the file name from file's output, so a non-PDF whose *name*
  # contains "PDF" no longer passes; -q keeps grep's match off stdout.
  elif file -b -- "$pdf_file" | grep -q "PDF"; then
    echo "Converting PDF file [$pdf_file] to text"
  else
    echo "$prog: $pdf_file is not a valid PDF file; skipping" >&2
    rc=1
    continue
  fi

  txt_file=$(basename -- "$pdf_file" .pdf).txt

  # A failed conversion must not be reported as success: without this check
  # the grep below would fail on the missing/partial text file and the
  # script could still exit 0.
  if ! pdftotext -layout -enc ASCII7 "$pdf_file" "$txt_file"; then
    echo "$prog: $pdf_file: Conversion to text failed; skipping" >&2
    rc=1
    continue
  fi

  echo "Checking text file [$txt_file] for non-printables"
  # Accepted bytes are form feed (0x0c) and 0x20-0x7F; any line containing
  # something else is printed with its line number.
  grep --color='auto' -P -n "[^\x0c\x20-\x7F]" "$txt_file"
  # grep status: 0 = match found, 1 = no match, >1 = grep itself failed.
  case $? in
    0)
      echo "$prog: Warning: Non-printable characters found in text file [$txt_file]" >&2
      rc=1
      ;;
    1)
      ;;
    *)
      echo "$prog: $txt_file: Unable to check text file for non-printables" >&2
      rc=1
      ;;
  esac
done
exit "$rc"