diff --git a/.gitignore b/.gitignore index 5f5032e..639fa2c 100644 --- a/.gitignore +++ b/.gitignore @@ -108,4 +108,18 @@ ENV/ .mypy_cache/ # VScode -.vscode/ \ No newline at end of file +.vscode/ + +# Windows dependencies / batch files +cwRsync*/* +*.exe +*.bat + +# nltk data directory +src/nltk_data/** + +# Wget temporary directory +*gutenberg*/ + +# Jupyter notebooks for processing data +*.ipynb \ No newline at end of file diff --git a/README.md b/README.md index 7bd926f..ef32add 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,23 @@ SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that wer For **most other use cases**, however, you probably want the latest, most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset. +## Changes in this fork +- Windows support (you still need to install `wget` and `cwRsync`; cwRsync was tested with version 5.4.1) +- Fixes to the original code: + - unwanted garbage in bookshelves info (probably due to updates to the Project Gutenberg website) + - oversights (bookshelves info was never fetched, missing nltk download, UTF-8 decoding error in ebook header, etc.) + - bugs & typos +- Parallelised text processing +- Additional arguments for customisation (see the [**Usage**](#usage) section) +> **Note:** +> This fork has only been tested on Windows so far, but it should work on other platforms wherever the original code works. + +## Todo +- Better tokenisation rules? + - Chinese books are all empty after tokenisation -> use jieba, probably? + - Only tokens that return `True` for `str.isalpha()` are kept currently +- Faster method for getting bookshelves info? 
+ ## Installation :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/). @@ -43,7 +60,7 @@ python get_data.py This will download a copy of all UTF-8 books in PG and will create a csv file with metadata (e.g. author, title, year, ...). Notice that if you already have some of the data, the program will only download those you are missing (we use `rsync` for this). It is hence easy to update the dataset periodically to keep it up-to-date by just running `get_data.py`. - +> For Windows users, see the [**Usage**](#usage) section ## Processing the data To process all the data in the `raw/` directory, run @@ -51,6 +68,73 @@ To process all the data in the `raw/` directory, run python process_data.py ``` This will fill in the `text/`, `tokens/` and `counts/` folders. +> To avoid losing ebooks that are actually UTF-8 but were mistakenly skipped by the original code, see the [**Usage**](#usage) section +## Usage +**Recommended usage for `get_data.py` (Windows users):** +```bash +python get_data.py --rsync "cwRsync_5.4.1/rsync" +``` +(replace `cwRsync_5.4.1/rsync` with the path to your rsync binary; the `.exe` extension is not needed) +**Recommended usage for `process_data.py`:** +```bash +python process_data.py --ignore +``` +**How to use `get_data.py` with customisation options:** +``` +python get_data.py --help +usage: Update local PG repository. + +This script will download all books currently not in your +local copy of PG and get the latest version of the metadata. + + [-h] [-m MIRROR] [-r RAW] [-M METADATA] [-p PATTERN] [-k] [-owr] [-q] [-c] [--rsync RSYNC] [--procedures PROCEDURES] + +options: + -h, --help show this help message and exit + -m MIRROR, --mirror MIRROR + Path to the mirror folder that will be updated via rsync. + -r RAW, --raw RAW Path to the raw folder. + -M METADATA, --metadata METADATA + Path to the metadata folder. + -p PATTERN, --pattern PATTERN + Patterns to get only a subset of books. 
+ -k, --keep_rdf If there is an RDF file in metadata dir, do not overwrite it. + -owr, --overwrite_raw + Overwrite files in raw. + -q, --quiet Quiet mode, do not print info, warnings, etc + -c, --clean Clean the mirror directory to remove any empty folders + --rsync RSYNC Specify an alternative rsync command + --procedures PROCEDURES + Procedures to go through, defaults to "pdlmbs": [p]ull mirror files; find [d]uplicates; hard [l]ink from mirror to raw; + get [m]etadata; get [b]ookshelf information; [s]tore bookshelf information +``` + +**How to use `process_data.py` with customisation options:** +``` +python process_data.py --help +usage: Processing raw texts from Project Gutenberg: i) removing headers,ii) tokenizing, and iii) counting words. + [-h] [-r RAW] [-ote OUTPUT_TEXT] [-oto OUTPUT_TOKENS] [-oco OUTPUT_COUNTS] [-p PATTERN] [-q] [-l LOG_FILE] [-c] [--ignore] + [--pool {process,thread}] + +options: + -h, --help show this help message and exit + -r RAW, --raw RAW Path to the raw-folder + -ote OUTPUT_TEXT, --output_text OUTPUT_TEXT + Path to text-output (text_dir) + -oto OUTPUT_TOKENS, --output_tokens OUTPUT_TOKENS + Path to tokens-output (tokens_dir) + -oco OUTPUT_COUNTS, --output_counts OUTPUT_COUNTS + Path to counts-output (counts_dir) + -p PATTERN, --pattern PATTERN + Pattern to specify a subset of books + -q, --quiet Quiet mode, do not print info, warnings, etc + -l LOG_FILE, --log_file LOG_FILE + Path to log file + -c, --check_empty Whether to check if existing files are empty + --ignore Whether to ignore UTF-8 decoding errors + --pool {process,thread} + Whether to use multi-processing or multi-threading +``` diff --git a/get_data.py b/get_data.py index 5012e68..0ebdf8d 100644 --- a/get_data.py +++ b/get_data.py @@ -5,7 +5,7 @@ M. Gerlach & F. 
Font-Clos """ -from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror +from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror, remove_empty_dirs, is_win32 from src.metadataparser import make_df_metadata from src.bookshelves import get_bookshelves from src.bookshelves import parse_bookshelves @@ -22,6 +22,7 @@ "This script will download all books currently not in your\n" "local copy of PG and get the latest version of the metadata.\n" ) + # mirror dir parser.add_argument( "-m", "--mirror", @@ -68,16 +69,50 @@ action="store_true", help="Quiet mode, do not print info, warnings, etc" ) + + # clean argument, to supress info + parser.add_argument( + "-c", "--clean", + action="store_true", + help="Clean the mirror directory to remove any empty folders" + ) + + # rsync command + parser.add_argument( + "--rsync", + help="Specify an alternative rsync command", + default='rsync', + type=str) + + # rsync command + parser.add_argument( + "--procedures", + help='''Procedures to go through, defaults to \"pdlmbs\": + [p]ull mirror files; + find [d]uplicates; + hard [l]ink from mirror to raw; + get [m]etadata; + get [b]ookshelf information; + [s]tore bookshelf information''', + default='pdlmbs', + type=str) # create the parser args = parser.parse_args() + mirror_dir, raw_dir, metadata_dir = args.mirror, args.raw, args.metadata + + if is_win32: + print("Windows detected, please make sure wget is installed and added to PATH") + mirror_dir = mirror_dir.replace('/', '\\') + raw_dir = raw_dir.replace('/', '\\') + metadata_dir = metadata_dir.replace('/', '\\') # check that all dirs exist - if not os.path.isdir(args.mirror): + if not os.path.isdir(mirror_dir): raise ValueError("The specified mirror directory does not exist.") - if not os.path.isdir(args.raw): + if not os.path.isdir(raw_dir): raise ValueError("The specified raw directory does not exist.") - if not os.path.isdir(args.metadata): + if not os.path.isdir(metadata_dir): raise ValueError("The 
specified metadata directory does not exist.") # Update the .mirror directory via rsync @@ -99,49 +134,61 @@ # + 12345 - 0 . t x t #--------------------------------------------- # [.-][t0][x.]t[x.] * [t8] - sp_args = ["rsync", "-am%s" % vstring, - "--include", "*/", - "--include", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern, - "--exclude", "*", - "aleph.gutenberg.org::gutenberg", args.mirror - ] - subprocess.call(sp_args) + includes = ["*/", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern] + excludes = ["*"] + sp_args = ' '.join([args.rsync, "-am%s" % vstring] + ["--include=\"%s\"" % i for i in includes] + \ + ["--exclude=\"%s\"" % i for i in excludes] + ["aleph.gutenberg.org::gutenberg", mirror_dir]) + + # If specified, remove any empty directory that might be caused by bugs or wrong patterns in rsync + if args.clean: + remove_empty_dirs(mirror_dir, args.quiet) + + # Subprocess call (default arguments): + # rsync -amv --include="*/" --include="[p123456789][g0123456789]*[.-][t0][x.]t[x.]*[t8]" --exclude="*" aleph.gutenberg.org::gutenberg data/.mirror/ + if 'p' in args.procedures: + subprocess.call(sp_args) # Get rid of duplicates # --------------------- # A very small portion of books are stored more than # once in PG's site. We keep the newest one, see # erase_duplicates_in_mirror docstring. - dups_list = list_duplicates_in_mirror(mirror_dir=args.mirror) + dups_list = list_duplicates_in_mirror(mirror_dir=mirror_dir) if 'd' in args.procedures else [] # Populate raw from mirror # ------------------------ # We populate 'raw_dir' hardlinking to # the hidden 'mirror_dir'. Names are standarized # into PG12345_raw.txt form. 
- populate_raw_from_mirror( - mirror_dir=args.mirror, - raw_dir=args.raw, - overwrite=args.overwrite_raw, - dups_list=dups_list, - quiet=args.quiet + if 'l' in args.procedures: + populate_raw_from_mirror( + mirror_dir=mirror_dir, + raw_dir=raw_dir, + overwrite=args.overwrite_raw, + dups_list=dups_list, + quiet=args.quiet ) # Update metadata # --------------- # By default, update the whole metadata csv # file each time new data is downloaded. - make_df_metadata( - path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'), - path_out=os.path.join(args.metadata, 'metadata.csv'), - update=args.keep_rdf + if 'm' in args.procedures: + make_df_metadata( + path_xml=os.path.join(metadata_dir, 'rdf-files.tar.bz2'), + path_out=os.path.join(metadata_dir, 'metadata.csv'), + update=args.keep_rdf ) # Bookshelves # ----------- # Get bookshelves and their respective books and titles as dicts - BS_dict, BS_num_to_category_str_dict = parse_bookshelves() - with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: - pickle.dump(BS_dict, fp) - with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp: - pickle.dump(BS_num_to_category_str_dict, fp) \ No newline at end of file + if 'b' in args.procedures: + get_bookshelves() + + if 's' in args.procedures: + BS_dict, BS_num_to_category_str_dict = parse_bookshelves() + with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: + pickle.dump(BS_dict, fp) + with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp: + pickle.dump(BS_num_to_category_str_dict, fp) \ No newline at end of file diff --git a/process_data.py b/process_data.py index e21b92f..bdfc75a 100644 --- a/process_data.py +++ b/process_data.py @@ -11,12 +11,18 @@ import glob import ast import pandas as pd +import concurrent.futures +import io +import re from src.pipeline import process_book -from src.utils import get_langs_dict +from src.utils import get_langs_dict, check_not_empty, is_win32 if __name__ == '__main__': + from src.tokenizer import nltk_dir 
+ import nltk + nltk.download("punkt", nltk_dir) # avoid lookup error parser = argparse.ArgumentParser( "Processing raw texts from Project Gutenberg:" @@ -48,7 +54,7 @@ # pattern to specify subset of books parser.add_argument( "-p", "--pattern", - help="Patttern to specify a subset of books", + help="Pattern to specify a subset of books", default='*', type=str) @@ -65,20 +71,48 @@ help="Path to log file", default=".log", type=str) + + # check if existing files are empty + parser.add_argument( + "-c", "--check_empty", + action="store_true", + help="Whether to check if existing files are empty") + + # whether to ignore UTF-8 decoding errors + parser.add_argument( + "--ignore", + action="store_true", + help="Whether to ignore UTF-8 decoding errors") + + # multi-threading/processing choice + parser.add_argument( + "--pool", + help="Whether to use multi-processing or multi-threading", + default="process", + choices=["process", "thread"], + type=str) # add arguments to parser args = parser.parse_args() + raw_dir, text_dir, tokens_dir, counts_dir = args.raw, args.output_text, args.output_tokens, args.output_counts + + if is_win32: + print("Windows detected") + raw_dir = raw_dir.replace('/', '\\') + text_dir = text_dir.replace('/', '\\') + tokens_dir = tokens_dir.replace('/', '\\') + counts_dir = counts_dir.replace('/', '\\') # check whether the out-put directories exist - if os.path.isdir(args.output_text) is False: + if os.path.isdir(text_dir) is False: raise ValueError("The directory for output of texts '%s' " - "does not exist" % (args.output_text)) - if os.path.isdir(args.output_tokens) is False: + "does not exist" % (text_dir)) + if os.path.isdir(tokens_dir) is False: raise ValueError("The directory for output of tokens '%s' " - "does not exist" % (args.output_tokens)) - if os.path.isdir(args.output_counts) is False: + "does not exist" % (tokens_dir)) + if os.path.isdir(counts_dir) is False: raise ValueError("The directory for output of counts '%s' " - "does not 
exist" % (args.output_counts)) + "does not exist" % (counts_dir)) # load metadata metadata = pd.read_csv("metadata/metadata.csv").set_index("id") @@ -88,40 +122,114 @@ # loop over all books in the raw-folder pbooks = 0 - for filename in glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))): - # The process_books function will fail very rarely, whne - # a file tagged as UTf-8 is not really UTF-8. We kust - # skip those books. - try: - # get PG_id - PG_id = filename.split("/")[-1].split("_")[0] - - # get language from metadata - # default is english - language = "english" - # language is a string representing a list of languages codes - lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0] - if lang_id in langs_dict.keys(): - language = langs_dict[lang_id] - - # process the book: strip headers, tokenize, count - process_book( - path_to_raw_file=filename, - text_dir=args.output_text, - tokens_dir=args.output_tokens, - counts_dir=args.output_counts, - language=language, - log_file=args.log_file - ) - pbooks += 1 + + # find out which jobs were already done + re_pattern = args.pattern.replace('*', '.*') # wild card roughly equals .* in regex + pattern_text = re.compile('(PG%s)_text.txt' % (re_pattern)) + pattern_tokens = re.compile('(PG%s)_tokens.txt' % (re_pattern)) + pattern_counts = re.compile('(PG%s)_counts.txt' % (re_pattern)) + exist_text = {pattern_text.fullmatch(f) for f in + glob.glob('PG%s_text.txt' % (args.pattern), root_dir=text_dir)} + exist_tokens = {pattern_tokens.fullmatch(f) for f in + glob.glob('PG%s_tokens.txt' % (args.pattern), root_dir=tokens_dir)} + exist_counts = {pattern_counts.fullmatch(f) for f in + glob.glob('PG%s_counts.txt' % (args.pattern), root_dir=counts_dir)} + + exist_text = {f.group(1) for f in exist_text if f} + exist_tokens = {f.group(1) for f in exist_tokens if f} + exist_counts = {f.group(1) for f in exist_counts if f} + done_jobs = exist_text & exist_tokens & exist_counts + del exist_text, exist_tokens, exist_counts + 
+ if args.check_empty: + with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool: + tmp0 = [(os.path.join(text_dir, PG_id) + "_text.txt", PG_id) for PG_id in done_jobs] + tmp1 = [(os.path.join(tokens_dir, PG_id) + "_tokens.txt", PG_id) for PG_id in done_jobs] + tmp2 = [(os.path.join(counts_dir, PG_id) + "_counts.txt", PG_id) for PG_id in done_jobs] + validate_jobs0 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp0} + validate_jobs1 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp1} + validate_jobs2 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp2} + validation_results = {PG_id : 0 for PG_id in done_jobs} if not args.quiet: - print("Processed %d books..." % pbooks, end="\r") - except UnicodeDecodeError: + print("%d books to check for completion (3 passes required)" % len(done_jobs)) + for job_type in [validate_jobs0, validate_jobs1, validate_jobs2]: + pbooks = 0 + for job in concurrent.futures.as_completed(job_type): + if job.result(): + validation_results[job_type[job]] += 1 + pbooks += 1 + if (not args.quiet) and (pbooks % 100 == 0): + print("%6d books checked for completion" % pbooks, end="\r") + done_jobs = {PG_id for PG_id in validation_results if validation_results[PG_id] == 3} if not args.quiet: - print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename) - except KeyError: + print("%d books seem to be processed but have empty file(s)" %(len(validation_results) - len(done_jobs))) + del tmp0, tmp1, tmp2, validate_jobs0, validate_jobs1, validate_jobs2, validation_results + + with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool: + book_process_jobs = dict() + for filename in glob.glob(join(raw_dir, 'PG%s_raw.txt' % (args.pattern))): + # The process_books function will fail very rarely, whne + # a file tagged as UTf-8 is not really UTF-8. We kust + # skip those books. 
+ # get PG_id + PG_id = os.path.split(filename)[-1].split("_")[0] + + if PG_id not in done_jobs: + # get language from metadata + # default is english + language = "english" + try: + # language is a string representing a list of languages codes + lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0] + if lang_id in langs_dict.keys(): + language = langs_dict[lang_id] + except KeyError: + if not args.quiet: + msg = "# WARNING: metadata for '%s' not found" % filename + print(msg) + if args.log_file: + with io.open(args.log_file, "a") as f: + f.write(msg + '\n') + + # process the book: strip headers, tokenize, count + book_process_jobs[ + pool.submit( + process_book, + path_to_raw_file=filename, + text_dir=text_dir, + tokens_dir=tokens_dir, + counts_dir=counts_dir, + overwrite_all=True, + language=language, + log_file=args.log_file, + ignore=args.ignore) + ] = PG_id + + pbooks += 1 if not args.quiet: - print("# WARNING: metadata for '%s' not found" % filename) - except Exception as e: + print("%d book processing jobs started..." % pbooks, end="\r") + + print("\n%d book processing jobs created in total" % len(book_process_jobs)) + + pbooks = 0 + for job in concurrent.futures.as_completed(book_process_jobs): + PG_id = book_process_jobs[job] + if args.log_file: + try: + log_content = job.result() + with io.open(args.log_file, "a") as f: + f.write(log_content) + except UnicodeDecodeError: + if not args.quiet: + print("# WARNING: cannot process '%s' (encoding not UTF-8)" % PG_id) + except LookupError as e: + print("Very likely that an NLTK resource needs to be downloaded") + raise e + except Exception as e: + if not args.quiet: + print("# WARNING: cannot process '%s' (unkown error)" % PG_id) + raise e + pbooks += 1 if not args.quiet: - print("# WARNING: cannot process '%s' (unkown error)" % filename) + print("Processed %d books..." 
% pbooks, end="\r") + print("\ndone") diff --git a/src/bookshelves.py b/src/bookshelves.py index 62e31aa..791b31c 100644 --- a/src/bookshelves.py +++ b/src/bookshelves.py @@ -7,7 +7,19 @@ import pandas as pd import lxml.html import subprocess +import shutil +from .utils import is_win32 +def rm_dir(*args): + subprocess.call(["rm", "-rf"] + args) + +if is_win32: + rm_dir = shutil.rmtree + def rm_pattern(path): + for file in glob.glob(path.replace('/', os.path.sep)): + os.remove(file) +else: + rm_pattern = rm_dir def get_bookshelves(): """ @@ -19,8 +31,8 @@ def get_bookshelves(): """ sp_args = ["wget", - "--random-wait", "-r", - "-p", "--no-parent", + "--random-wait", "-r", + "--no-parent", "-e", "robots=off", "-U", "mozilla", "https://www.gutenberg.org/ebooks/bookshelf/" @@ -28,21 +40,25 @@ def get_bookshelves(): subprocess.call(sp_args) # move it to metadata dir - sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/" - subprocess.call(sp_args, shell=True) + if is_win32: + dst = "metadata/bookshelves_html/" + if not os.path.exists(dst): + os.mkdir(dst) + for src_path in glob.glob("www.gutenberg.org/ebooks/bookshelf/*"): + shutil.move(src_path, dst) + else: + sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/" + subprocess.call(sp_args, shell=True) # cleanup - sp_args = ["rm", "-rf", "www.gutenberg.org"] - subprocess.call(sp_args) + if os.path.exists("www.gutenberg.org"): + rm_dir("www.gutenberg.org") # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page) # we get also other files, copy of the bookshelves but with different ordering # remove them - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"] - subprocess.call(sp_args) + 
rm_pattern("metadata/bookshelves_html/*.opds*") + rm_pattern("metadata/bookshelves_html/*?sort*") + rm_pattern("metadata/bookshelves_html/*?start*") return None def parse_bookshelves(): @@ -57,21 +73,23 @@ def parse_bookshelves(): """ # parse the data BS_paths = glob.glob("metadata/bookshelves_html/*") - BS = [path.split("/")[-1] for path in BS_paths] + # BS = [os.path.split(path)[-1] for path in BS_paths] BS_dict = {} BS_num_to_category_str_dict = {} for path in BS_paths: - bs = path.split("/")[-1] + _, bs = os.path.split(path) + PG_header = '' if bs == "index.html" else "PG" BS_dict[bs] = [] with open(path, "r", encoding="UTF-8") as foo: - dom = lxml.html.fromstring(foo.read()) + dom = lxml.html.parse(path) # select the url in href for all a tags(links) for link in dom.xpath('//a/@href'): # links to ebooks that are not searches if link.find("ebooks") > -1 and link.find("search") == -1: - PGid = "PG"+link.split("/")[-1] - BS_dict[bs].append(PGid) + book_id = link.split("/")[-1] + if book_id.isdigit(): + BS_dict[bs].append(PG_header + book_id) # get title of the category title_categories = dom.findall('.//title') # './/title' finds recursively the element with tag 'title' # check if there is only one title in the metadata of the category diff --git a/src/cleanup.py b/src/cleanup.py index 69967f6..90cb232 100644 --- a/src/cleanup.py +++ b/src/cleanup.py @@ -16,7 +16,7 @@ def cleanup(path, text_dir): Path to the PG****_raw.txt file """ - PG_number = path.split("/")[-1].split("_")[0][2:] + PG_number = os.path.split(path)[-1].split("_")[0][2:] with io.open(path) as f: text = f.read() diff --git a/src/pipeline.py b/src/pipeline.py index 5e89c56..ccc86aa 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -14,7 +14,8 @@ def process_book( cleanup_f=strip_headers, overwrite_all=False, language="english", - log_file="" + log_file="", + ignore=False ): """ Process a book, from raw data to counts. 
@@ -39,6 +40,9 @@ def process_book( ---------- overwrite_all : bool If set to True, everything is processed regargless of existing files. + ignore : bool + If set to True, ignores UTF-8 decoding errors for "technically UTF-8" codecs + such as Windows-1252, enabling this shouldn't lead to the loss of any token """ if text_dir is None: raise ValueError("You must specify a path to save the text files.") @@ -53,14 +57,15 @@ def process_book( raise ValueError("You must specify a path to the raw file to process.") # get PG number - PG_number = path_to_raw_file.split("/")[-1].split("_")[0][2:] + PG_number = os.path.split(path_to_raw_file)[-1].split("_")[0][2:] if overwrite_all or\ (not os.path.isfile(os.path.join(text_dir,"PG%s_text.txt"%PG_number))) or \ (not os.path.isfile(os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number))) or \ (not os.path.isfile(os.path.join(counts_dir,"PG%s_counts.txt"%PG_number))): # read raw file - with io.open(path_to_raw_file, encoding="UTF-8") as f: + with io.open(path_to_raw_file, encoding="UTF-8", + errors="ignore" if ignore else "strict") as f: text = f.read() # clean it up @@ -93,6 +98,5 @@ def process_book( clean_nl = clean.count("\n") L = len(tokens) V = len(counts) - with io.open(log_file, "a") as f: - f.write("PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n") + return "PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n" diff --git a/src/tokenizer.py b/src/tokenizer.py index b25f21e..873fb9c 100644 --- a/src/tokenizer.py +++ b/src/tokenizer.py @@ -3,9 +3,10 @@ Call tokenize and pass a text (i.e. as a string). 
You will get a list of tokens """ +nltk_dir = "src/nltk_data" import nltk -nltk.data.path=["src/nltk_data"] +nltk.data.path=[nltk_dir] from nltk.tokenize.treebank import TreebankWordTokenizer from nltk.tokenize import sent_tokenize @@ -42,7 +43,7 @@ def tokenize_text(text, language="english"): def filter_tokens(list_tokens): '''Remove un-wanted tokens from list of tokens - We only keep words that return TRUE for string.isaplha() + We only keep words that return TRUE for string.isalpha() We lowercase every token with string.lower() ''' list_tokens_filter = [h.lower() for h in list_tokens if h.isalpha()] diff --git a/src/utils.py b/src/utils.py index ceea2ea..6b5a8cd 100644 --- a/src/utils.py +++ b/src/utils.py @@ -3,6 +3,11 @@ import shutil import subprocess import glob +import io +from sys import platform + +# add support for windows +is_win32 = platform.casefold() == "win32" def get_langs_dict(): """ @@ -64,7 +69,7 @@ def list_duplicates_in_mirror( dups_list = [] for dirName, subdirList, fileList in os.walk(mirror_dir): for matchpath in glob.iglob(os.path.join(dirName,"*-0.txt")): - fname = matchpath.split("/")[-1] + _, fname = os.path.split(matchpath) # fname must have exactly one "." and one "-" if (len(fname.split("."))==2 and len(fname.split("-"))==2): PGnumber = get_PG_number(fname) @@ -97,10 +102,24 @@ def populate_raw_from_mirror(mirror_dir=None, Files in this list are not copied into raw. 
""" - for dirName, subdirList, fileList in os.walk(mirror_dir): + + # for non-Windows environments + def hard_link(src: str, tgt: str): + if (not os.path.isfile(tgt)) or overwrite: + subprocess.call(["ln", "-f", src, tgt]) + + # for Windows environments + if is_win32: + def win_hard_link(src: str, tgt: str): + if os.path.isfile(tgt) and overwrite: + subprocess.call(["del", tgt]) + subprocess.call("mklink /H %s %s" % (tgt, src), shell=True) + hard_link = win_hard_link + + for dirName, _, _ in os.walk(mirror_dir): # patterns to match are 12345-0.txt or pg12345.txt.utf8 for matchpath in glob.iglob(os.path.join(dirName, "[p123456789][g0123456789][0-9]*")): - fname = matchpath.split("/")[-1] + _, fname = os.path.split(matchpath) # check that file is not in dups_list if matchpath not in dups_list: # avoid files with more "." or "-" than expected @@ -112,12 +131,38 @@ def populate_raw_from_mirror(mirror_dir=None, source = os.path.join(dirName, fname) target = os.path.join(raw_dir, "PG"+PGnumber+"_raw.txt") - if (not os.path.isfile(target)) or overwrite: - subprocess.call(["ln", "-f", source, target]) + hard_link(source, target) # if file was not in dupes list and we are not quiet elif not quiet: print("# WARNING: file %s skipped due to duplication" % fname) +def remove_empty_dirs(path: str, quiet: bool=False): + """ + Removes empty directories in specified path + Parameters + ---------- + path : str + the path to clean + quiet : bool + whether to notify the deletion + """ + # Check if the given path is a directory + if not os.path.isdir(path): + print(f"Error: {path} is not a valid directory.") + return + + # Recursively remove empty subdirectories + for dirName, subdirList, _ in os.walk(path, topdown=False): + for subdir in subdirList: + subdir_path = os.path.join(dirName, subdir) + if not os.listdir(subdir_path): # Check if the directory is empty + os.rmdir(subdir_path) # Remove the empty directory + if not quiet: + print(f"Removed empty directory: {subdir_path}") + 
+def check_not_empty(fname: str) -> bool: + with io.open(fname, errors="ignore", encoding="utf-8") as f: + return bool(f.read().strip())