From 2e455e44d9523a2facc0318b8937c687209cec37 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Thu, 28 Dec 2023 21:54:55 +0000
Subject: [PATCH 01/20] Added argument for customization of rsync command,
 changed rsync arguments to use proper formats

---
 .gitignore  |  7 ++++++-
 get_data.py | 15 +++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5f5032e..e5819c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,4 +108,9 @@ ENV/
 .mypy_cache/
 
 # VScode
-.vscode/
\ No newline at end of file
+.vscode/
+
+# Windows dependencies / batch files
+cwRsync*/*
+*.exe
+*.bat
\ No newline at end of file
diff --git a/get_data.py b/get_data.py
index 5012e68..9600467 100644
--- a/get_data.py
+++ b/get_data.py
@@ -68,6 +68,13 @@
         action="store_true",
         help="Quiet mode, do not print info, warnings, etc"
         )
+    
+    # rsync command
+    parser.add_argument(
+        "--rsync",
+        help="Specify rsync command if not `rsync`",
+        default='rsync',
+        type=str)
 
     # create the parser
     args = parser.parse_args()
@@ -99,10 +106,10 @@
     # + 12345 -   0   .  t x                 t 
     #---------------------------------------------
     #        [.-][t0][x.]t[x.]    *         [t8]
-    sp_args = ["rsync", "-am%s" % vstring,
-               "--include", "*/",
-               "--include", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern,
-               "--exclude", "*",
+    sp_args = [args.rsync, "-am%s" % vstring,
+               "--include=*/",
+               "--include=[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern,
+               "--exclude=*",
                "aleph.gutenberg.org::gutenberg", args.mirror
                ]
     subprocess.call(sp_args)

From e424d9536cf561c5e6eb48a4500bb37fab0a2f4b Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:26:12 +0000
Subject: [PATCH 02/20] Added Win32 support and a function for cleaning empty
 directories

---
 src/utils.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index ceea2ea..b8ed09d 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -3,6 +3,10 @@
 import shutil
 import subprocess
 import glob
+from sys import platform
+
+# add support for windows
+is_win32 = platform.casefold() == "win32"
 
 def get_langs_dict():
     """
@@ -64,7 +68,7 @@ def list_duplicates_in_mirror(
     dups_list = []
     for dirName, subdirList, fileList in os.walk(mirror_dir):
         for matchpath in glob.iglob(os.path.join(dirName,"*-0.txt")):
-            fname = matchpath.split("/")[-1]
+            _, fname = os.path.split(matchpath)
             # fname must have exactly one "." and one "-"
             if (len(fname.split("."))==2 and len(fname.split("-"))==2):
                 PGnumber = get_PG_number(fname)
@@ -97,10 +101,24 @@ def populate_raw_from_mirror(mirror_dir=None,
         Files in this list are not copied into raw.
 
     """
-    for dirName, subdirList, fileList in os.walk(mirror_dir):
+    
+    # for non-Windows environments
+    def hard_link(src: str, tgt: str):
+        if (not os.path.isfile(tgt)) or overwrite:
+            subprocess.call(["ln", "-f", src, tgt])
+
+    # for Windows environments
+    if is_win32:
+        def win_hard_link(src: str, tgt: str):
+            if os.path.isfile(tgt) and overwrite:
+                subprocess.call(["del", tgt])
+            subprocess.call("mklink /H %s %s" % (tgt, src), shell=True)
+        hard_link = win_hard_link
+
+    for dirName, _, _ in os.walk(mirror_dir):
         # patterns to match are 12345-0.txt or pg12345.txt.utf8
         for matchpath in glob.iglob(os.path.join(dirName, "[p123456789][g0123456789][0-9]*")):
-            fname = matchpath.split("/")[-1]
+            _, fname = os.path.split(matchpath)
             # check that file is not in dups_list
             if matchpath not in dups_list:
                 # avoid files with more "." or "-" than expected
@@ -112,12 +130,36 @@ def populate_raw_from_mirror(mirror_dir=None,
                     source = os.path.join(dirName, fname)
                     target = os.path.join(raw_dir, "PG"+PGnumber+"_raw.txt")
 
-                    if (not os.path.isfile(target)) or overwrite:
-                        subprocess.call(["ln", "-f", source, target])
+                    hard_link(source, target)
 
             # if file was not in dupes list and we are not quiet
             elif not quiet:
                 print("# WARNING: file %s skipped due to duplication" % fname)
 
+def remove_empty_dirs(path: str, quiet: bool=False):
+    """
+    Removes empty directories in specified path
+
+    Parameters
+    ----------
+    path : str
+        the path to clean
+    quiet : bool
+        whether to notify the deletion
+
+    """
+    # Check if the given path is a directory
+    if not os.path.isdir(path):
+        print(f"Error: {path} is not a valid directory.")
+        return
+
+    # Recursively remove empty subdirectories
+    for dirName, subdirList, _ in os.walk(path, topdown=False):
+        for subdir in subdirList:
+            subdir_path = os.path.join(dirName, subdir)
+            if not os.listdir(subdir_path):  # Check if the directory is empty
+                os.rmdir(subdir_path)        # Remove the empty directory
+                if not quiet:
+                    print(f"Removed empty directory: {subdir_path}")
 
 

From eabe9810ce583dfdf3ba70e8912e7cfce0f6918f Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:28:11 +0000
Subject: [PATCH 03/20] Fixed typo, exposing nltk_data dir as string variable

---
 src/tokenizer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/tokenizer.py b/src/tokenizer.py
index b25f21e..873fb9c 100644
--- a/src/tokenizer.py
+++ b/src/tokenizer.py
@@ -3,9 +3,10 @@
    Call tokenize and pass a text (i.e. as a string).
    You will get a list of tokens
 """
+nltk_dir = "src/nltk_data"
 
 import nltk
-nltk.data.path=["src/nltk_data"]
+nltk.data.path=[nltk_dir]
 
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.tokenize import sent_tokenize
@@ -42,7 +43,7 @@ def tokenize_text(text, language="english"):
 
 def filter_tokens(list_tokens):
     '''Remove un-wanted tokens from list of tokens
-    We only keep words that return TRUE for string.isaplha()
+    We only keep words that return TRUE for string.isalpha()
     We lowercase every token with string.lower()
     '''
     list_tokens_filter = [h.lower() for h in list_tokens if h.isalpha()]

From f759ae0187d2e2cab56c3abaa6785480be85cc3d Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:30:50 +0000
Subject: [PATCH 04/20] Used OS-independent path-parsing, changed behavior of
 process_book function to return logging message instead of appending to log
 file directly

---
 src/pipeline.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/pipeline.py b/src/pipeline.py
index 5e89c56..3d290ba 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -53,7 +53,7 @@ def process_book(
         raise ValueError("You must specify a path to the raw file to process.")
    
     # get PG number
-    PG_number = path_to_raw_file.split("/")[-1].split("_")[0][2:]
+    PG_number = os.path.split(path_to_raw_file)[-1].split("_")[0][2:]
 
     if overwrite_all or\
         (not os.path.isfile(os.path.join(text_dir,"PG%s_text.txt"%PG_number))) or \
@@ -93,6 +93,5 @@ def process_book(
             clean_nl = clean.count("\n")
             L = len(tokens)
             V = len(counts)
-            with io.open(log_file, "a") as f:
-               f.write("PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n")
+            return "PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n"
                 

From 5e3ea272281f1612fdd982b5f06076d3767693b1 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:32:05 +0000
Subject: [PATCH 05/20] Used OS-independent path-parsing

---
 src/cleanup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cleanup.py b/src/cleanup.py
index 69967f6..90cb232 100644
--- a/src/cleanup.py
+++ b/src/cleanup.py
@@ -16,7 +16,7 @@ def cleanup(path, text_dir):
         Path to the PG****_raw.txt file
 
     """
-    PG_number = path.split("/")[-1].split("_")[0][2:]
+    PG_number = os.path.split(path)[-1].split("_")[0][2:]
     with io.open(path) as f:
         text = f.read()
 

From be5cb201fdb5e59dfc5bbcbff864413e75d3ff29 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:36:06 +0000
Subject: [PATCH 06/20] Added Win32 support

---
 src/bookshelves.py | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/src/bookshelves.py b/src/bookshelves.py
index 62e31aa..7ed8b2f 100644
--- a/src/bookshelves.py
+++ b/src/bookshelves.py
@@ -7,7 +7,19 @@
 import pandas as pd
 import lxml.html
 import subprocess
+import shutil
+from .utils import is_win32
 
+def rm_dir(*args):
+    subprocess.call(["rm", "-rf"] + args)
+
+rm_pattern = rm_dir
+
+if is_win32:
+    rm_dir = os.rmdir
+    def rm_pattern(path):
+        for file in glob.glob(path):
+            os.remove(file)
 
 def get_bookshelves():
     """
@@ -28,21 +40,24 @@ def get_bookshelves():
     subprocess.call(sp_args)
 
     # move it to metadata dir
-    sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/"
-    subprocess.call(sp_args, shell=True)
+    if is_win32:
+        dst = "metadata/bookshelves_html/"
+        if not os.path.exists(dst):
+            os.mkdir(dst)
+        for src_path in glob.glob("www.gutenberg.org/ebooks/bookshelf/*"):
+            shutil.move(src_path, dst)
+    else:
+        sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/"
+        subprocess.call(sp_args, shell=True)
 
     # cleanup
-    sp_args = ["rm", "-rf", "www.gutenberg.org"]
-    subprocess.call(sp_args)
+    rm_dir("www.gutenberg.org")
     # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page)
     # we get also other files, copy of the bookshelves but with different ordering
     # remove them
-    sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"]
-    subprocess.call(sp_args)
-    sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"]
-    subprocess.call(sp_args)
-    sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"]
-    subprocess.call(sp_args)
+    rm_pattern("metadata/bookshelves_html/*.opds*")
+    rm_pattern("metadata/bookshelves_html/*?sort*")
+    rm_pattern("metadata/bookshelves_html/*?start*")
     return None
 
 def parse_bookshelves():
@@ -57,12 +72,12 @@ def parse_bookshelves():
     """
     # parse the data
     BS_paths = glob.glob("metadata/bookshelves_html/*")
-    BS = [path.split("/")[-1] for path in BS_paths]
+    # BS = [os.path.split(path)[-1] for path in BS_paths]
 
     BS_dict = {}
     BS_num_to_category_str_dict = {}
     for path in BS_paths:
-        bs = path.split("/")[-1]
+        _, bs = os.path.split(path)
         BS_dict[bs] = []
         with open(path, "r", encoding="UTF-8") as foo:
             dom = lxml.html.fromstring(foo.read())

From 0e6404d29abbc0b3769027bb99c7031f2e9cc699 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:39:24 +0000
Subject: [PATCH 07/20] Added Win32 support and freedom to specify the stages
 to go through via an optional `--procedures` argument

---
 get_data.py | 92 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 28 deletions(-)

diff --git a/get_data.py b/get_data.py
index 9600467..5cdc3f0 100644
--- a/get_data.py
+++ b/get_data.py
@@ -5,7 +5,7 @@
 M. Gerlach & F. Font-Clos
 
 """
-from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror
+from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror, remove_empty_dirs, is_win32
 from src.metadataparser import make_df_metadata
 from src.bookshelves import get_bookshelves
 from src.bookshelves import parse_bookshelves
@@ -22,6 +22,7 @@
         "This script will download all books currently not in your\n"
         "local copy of PG and get the latest version of the metadata.\n"
         )
+    
     # mirror dir
     parser.add_argument(
         "-m", "--mirror",
@@ -69,22 +70,48 @@
         help="Quiet mode, do not print info, warnings, etc"
         )
     
+    # clean argument, to supress info
+    parser.add_argument(
+        "-c", "--clean",
+        action="store_true",
+        help="Clean the mirror directory to remove any empty folders"
+        )
+    
     # rsync command
     parser.add_argument(
         "--rsync",
-        help="Specify rsync command if not `rsync`",
+        help="Specify an alternative rsync command",
         default='rsync',
         type=str)
+    
+    # rsync command
+    parser.add_argument(
+        "--procedures",
+        help='''Procedures to go through, defaults to \"pdlmb\": 
+        - [p]ull mirror files
+        - find [d]uplicates
+        - hard [l]ink from mirror to raw
+        - get [m]etadata
+        - get [b]ookshelf information''',
+        default='pdlmb',
+        type=str)
 
     # create the parser
     args = parser.parse_args()
+    mirror_dir, raw_dir, metadata_dir = args.mirror, args.raw, args.metadata
+    
+    if is_win32:
+        print("Windows detected, please make sure wget is installed and added to PATH")
+        mirror_dir = mirror_dir.replace('/', '\\')
+        raw_dir = raw_dir.replace('/', '\\')
+        metadata_dir = metadata_dir.replace('/', '\\')
 
     # check that all dirs exist
-    if not os.path.isdir(args.mirror):
+    if not os.path.isdir(mirror_dir):
         raise ValueError("The specified mirror directory does not exist.")
-    if not os.path.isdir(args.raw):
+    if not os.path.isdir(raw_dir):
         raise ValueError("The specified raw directory does not exist.")
-    if not os.path.isdir(args.metadata):
+    if not os.path.isdir(metadata_dir):
         raise ValueError("The specified metadata directory does not exist.")
 
     # Update the .mirror directory via rsync
@@ -106,49 +133,58 @@
     # + 12345 -   0   .  t x                 t 
     #---------------------------------------------
     #        [.-][t0][x.]t[x.]    *         [t8]
-    sp_args = [args.rsync, "-am%s" % vstring,
-               "--include=*/",
-               "--include=[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern,
-               "--exclude=*",
-               "aleph.gutenberg.org::gutenberg", args.mirror
-               ]
-    subprocess.call(sp_args)
+    includes = ["*/", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern]
+    excludes = ["*"]
+    sp_args = ' '.join([args.rsync, "-am%s" % vstring] + ["--include=\"%s\"" % i for i in includes] + \
+        ["--exclude=\"%s\"" % i for i in excludes] + ["aleph.gutenberg.org::gutenberg", mirror_dir])
+    
+    # If specified, remove any empty directory that might be caused by bugs or wrong patterns in rsync
+    if args.clean:
+        remove_empty_dirs(mirror_dir, args.quiet)
+
+    # Subprocess call (default arguments):
+    # rsync -amv --include="*/" --include="[p123456789][g0123456789]*[.-][t0][x.]t[x.]*[t8]" --exclude="*" aleph.gutenberg.org::gutenberg data/.mirror/
+    if 'p' in args.procedures:
+        subprocess.call(sp_args) 
 
     # Get rid of duplicates
     # ---------------------
     # A very small portion of books are stored more than
     # once in PG's site. We keep the newest one, see
     # erase_duplicates_in_mirror docstring.
-    dups_list = list_duplicates_in_mirror(mirror_dir=args.mirror)
+    dups_list = list_duplicates_in_mirror(mirror_dir=mirror_dir) if 'd' in args.procedures else []
 
     # Populate raw from mirror
     # ------------------------
     # We populate 'raw_dir' hardlinking to
     # the hidden 'mirror_dir'. Names are standarized
     # into PG12345_raw.txt form.
-    populate_raw_from_mirror(
-        mirror_dir=args.mirror,
-        raw_dir=args.raw,
-        overwrite=args.overwrite_raw,
-        dups_list=dups_list,
-        quiet=args.quiet
+    if 'l' in args.procedures:
+        populate_raw_from_mirror(
+            mirror_dir=mirror_dir,
+            raw_dir=raw_dir,
+            overwrite=args.overwrite_raw,
+            dups_list=dups_list,
+            quiet=args.quiet
         )
 
     # Update metadata
     # ---------------
     # By default, update the whole metadata csv
     # file each time new data is downloaded.
-    make_df_metadata(
-        path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'),
-        path_out=os.path.join(args.metadata, 'metadata.csv'),
-        update=args.keep_rdf
+    if 'm' in args.procedures:
+        make_df_metadata(
+            path_xml=os.path.join(metadata_dir, 'rdf-files.tar.bz2'),
+            path_out=os.path.join(metadata_dir, 'metadata.csv'),
+            update=args.keep_rdf
         )
 
     # Bookshelves
     # -----------
     # Get bookshelves and their respective books and titles as dicts
-    BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
-    with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
-        pickle.dump(BS_dict, fp)
-    with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp:
-        pickle.dump(BS_num_to_category_str_dict, fp)
\ No newline at end of file
+    if 'b' in args.procedures:
+        BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
+        with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
+            pickle.dump(BS_dict, fp)
+        with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp:
+            pickle.dump(BS_num_to_category_str_dict, fp)
\ No newline at end of file

From 91ef9c0621828c52677f88797908eb5d6895ddbd Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:41:40 +0000
Subject: [PATCH 08/20] Fixed a typo and an oversight regarding nltk data
 download, more customization and multi-threading/processing support in
 progress

---
 process_data.py | 91 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 29 deletions(-)

diff --git a/process_data.py b/process_data.py
index e21b92f..1bbbfa3 100644
--- a/process_data.py
+++ b/process_data.py
@@ -11,12 +11,17 @@
 import glob
 import ast
 import pandas as pd
+import concurrent.futures
+import io
 
 from src.pipeline import process_book
-from src.utils import get_langs_dict
+from src.utils import get_langs_dict, is_win32
 
 
 if __name__ == '__main__':
+    from src.tokenizer import nltk_dir
+    import nltk
+    nltk.download("punkt", nltk_dir) # avoid lookup error
 
     parser = argparse.ArgumentParser(
         "Processing raw texts from Project Gutenberg:"
@@ -48,7 +53,7 @@
     # pattern to specify subset of books
     parser.add_argument(
         "-p", "--pattern",
-        help="Patttern to specify a subset of books",
+        help="Pattern to specify a subset of books",
         default='*',
         type=str)
 
@@ -68,17 +73,25 @@
 
     # add arguments to parser
     args = parser.parse_args()
+    raw_dir, text_dir, tokens_dir, counts_dir = args.raw, args.output_text, args.output_tokens, args.output_counts
+    
+    if is_win32:
+        print("Windows detected")
+        raw_dir = raw_dir.replace('/', '\\')
+        text_dir = text_dir.replace('/', '\\')
+        tokens_dir = tokens_dir.replace('/', '\\')
+        counts_dir = counts_dir.replace('/', '\\')
 
     # check whether the out-put directories exist
-    if os.path.isdir(args.output_text) is False:
+    if os.path.isdir(text_dir) is False:
         raise ValueError("The directory for output of texts '%s' "
-                         "does not exist" % (args.output_text))
-    if os.path.isdir(args.output_tokens) is False:
+                         "does not exist" % (text_dir))
+    if os.path.isdir(tokens_dir) is False:
         raise ValueError("The directory for output of tokens '%s' "
-                         "does not exist" % (args.output_tokens))
-    if os.path.isdir(args.output_counts) is False:
+                         "does not exist" % (tokens_dir))
+    if os.path.isdir(counts_dir) is False:
         raise ValueError("The directory for output of counts '%s' "
-                         "does not exist" % (args.output_counts))
+                         "does not exist" % (counts_dir))
 
     # load metadata
     metadata = pd.read_csv("metadata/metadata.csv").set_index("id")
@@ -88,13 +101,15 @@
 
     # loop over all books in the raw-folder
     pbooks = 0
-    for filename in glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))):
-        # The process_books function will fail very rarely, whne
-        # a file tagged as UTf-8 is not really UTF-8. We kust
-        # skip those books.
-        try:
+    
+    with concurrent.futures.ProcessPoolExecutor() as pool:
+        book_process_jobs = []
+        for filename in glob.glob(join(raw_dir, 'PG%s_raw.txt' % (args.pattern))):
+            # The process_books function will fail very rarely, whne
+            # a file tagged as UTf-8 is not really UTF-8. We kust
+            # skip those books.
             # get PG_id
-            PG_id = filename.split("/")[-1].split("_")[0]
+            PG_id = os.path.split(filename)[-1].split("_")[0]
 
             # get language from metadata
             # default is english
@@ -105,23 +120,41 @@
                 language = langs_dict[lang_id]
 
             # process the book: strip headers, tokenize, count
-            process_book(
+            book_process_jobs.append(pool.submit(
+                process_book,
                 path_to_raw_file=filename,
-                text_dir=args.output_text,
-                tokens_dir=args.output_tokens,
-                counts_dir=args.output_counts,
+                text_dir=text_dir,
+                tokens_dir=tokens_dir,
+                counts_dir=counts_dir,
                 language=language,
-                log_file=args.log_file
-            )
+                log_file=args.log_file))
+            
             pbooks += 1
             if not args.quiet:
-                print("Processed %d books..." % pbooks, end="\r")
-        except UnicodeDecodeError:
-            if not args.quiet:
-                print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename)
-        except KeyError:
-            if not args.quiet:
-                print("# WARNING: metadata for '%s' not found" % filename)
-        except Exception as e:
+                print("%d book processing jobs started..." % pbooks, end="\r")
+        
+        print()
+        pbooks = 0
+        for job in concurrent.futures.as_completed(book_process_jobs):
+            if args.log_file:
+                try:
+                    log_content = job.result()
+                    with io.open(args.log_file, "a") as f:
+                        f.write(log_content)
+                except UnicodeDecodeError:
+                    if not args.quiet:
+                        print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename)
+                except KeyError:
+                    if not args.quiet:
+                        print("# WARNING: metadata for '%s' not found" % filename)
+                except LookupError as e:
+                    print("Very likely that an NLTK resource needs to be downloaded")
+                    raise e
+                except Exception as e:
+                    if not args.quiet:
+                        print("# WARNING: cannot process '%s' (unkown error)" % filename)
+                        raise e
+            pbooks += 1
             if not args.quiet:
-                print("# WARNING: cannot process '%s' (unkown error)" % filename)
+                print("Processed %d books..." % pbooks, end="\r")
+            

From d986607cf947dd53624531bed46fce962452d9b9 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 02:51:28 +0000
Subject: [PATCH 09/20] Ignore the nltk data directory in .gitignore

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index e5819c8..df32ff6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -113,4 +113,7 @@ ENV/
 # Windows dependencies / batch files
 cwRsync*/*
 *.exe
-*.bat
\ No newline at end of file
+*.bat
+
+# nltk data directory
+src/nltk_data/**

From cb4f58b0c62ecc8ee30fbbeb532869b2c474a7d1 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 14:51:55 +0000
Subject: [PATCH 10/20] Added option to ignore UTF-8 decoding failures for
 "technically UTF-8" codecs such as Windows-1252

---
 src/pipeline.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/pipeline.py b/src/pipeline.py
index 3d290ba..ccc86aa 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -14,7 +14,8 @@ def process_book(
 	cleanup_f=strip_headers,
     overwrite_all=False,
     language="english",
-    log_file=""
+    log_file="",
+    ignore=False
 	):
     """
     Process a book, from raw data to counts.
@@ -39,6 +40,9 @@ def process_book(
     ----------
     overwrite_all : bool
         If set to True, everything is processed regargless of existing files.
+    ignore : bool
+        If set to True, ignores UTF-8 decoding errors for "technically UTF-8" codecs
+        such as Windows-1252, enabling this shouldn't lead to the loss of any token
     """
     if text_dir is None:
         raise ValueError("You must specify a path to save the text files.")
@@ -60,7 +64,8 @@ def process_book(
         (not os.path.isfile(os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number))) or \
         (not os.path.isfile(os.path.join(counts_dir,"PG%s_counts.txt"%PG_number))):
         # read raw file
-        with io.open(path_to_raw_file, encoding="UTF-8") as f:
+        with io.open(path_to_raw_file, encoding="UTF-8", 
+                     errors="ignore" if ignore else "strict") as f:
             text = f.read()
 
         # clean it up

From f69409559656e944ac368cd41acbaafecdbaee2e Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Sat, 30 Dec 2023 14:58:01 +0000
Subject: [PATCH 11/20] Added detection for books already processed, argument
 for specifying multi-threading or multi-processing, argument for ignoring UTF-8
 decoding failures

---
 process_data.py | 99 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 71 insertions(+), 28 deletions(-)

diff --git a/process_data.py b/process_data.py
index 1bbbfa3..566f909 100644
--- a/process_data.py
+++ b/process_data.py
@@ -13,6 +13,7 @@
 import pandas as pd
 import concurrent.futures
 import io
+import re
 
 from src.pipeline import process_book
 from src.utils import get_langs_dict, is_win32
@@ -70,6 +71,20 @@
         help="Path to log file",
         default=".log",
         type=str)
+    
+    # whether to ignore UTF-8 decoding errors
+    parser.add_argument(
+        "--ignore",
+        action="store_true",
+        help="Whether to ignore UTF-8 decoding errors")
+    
+    # multi-threading/processing choice
+    parser.add_argument(
+        "--pool",
+        help="Whether to use multi-processing or multi-threading",
+        default="process",
+        choices=["process", "thread"],
+        type=str)
 
     # add arguments to parser
     args = parser.parse_args()
@@ -101,9 +116,25 @@
 
     # loop over all books in the raw-folder
     pbooks = 0
-    
-    with concurrent.futures.ProcessPoolExecutor() as pool:
-        book_process_jobs = []
+
+    # find out which jobs were already done
+    re_pattern = args.pattern.replace('*', '.*') # wild card roughly equals .* in regex
+    pattern_text = re.compile('(PG%s)_text.txt' % (re_pattern))
+    pattern_tokens = re.compile('(PG%s)_tokens.txt' % (re_pattern))
+    pattern_counts = re.compile('(PG%s)_counts.txt' % (re_pattern))
+    exist_text = {pattern_text.fullmatch(f) for f in 
+                  glob.glob('PG%s_text.txt' % (args.pattern), root_dir=text_dir)}
+    exist_tokens = {pattern_tokens.fullmatch(f) for f in 
+                    glob.glob('PG%s_tokens.txt' % (args.pattern), root_dir=tokens_dir)}
+    exist_counts = {pattern_counts.fullmatch(f) for f in 
+                    glob.glob('PG%s_counts.txt' % (args.pattern), root_dir=counts_dir)}
+    exist_text = {f.group(1) for f in exist_text if f}
+    exist_tokens = {f.group(1) for f in exist_tokens if f}
+    exist_counts = {f.group(1) for f in exist_counts if f}
+    done_jobs = exist_text & exist_tokens & exist_counts
+
+    with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool:
+        book_process_jobs = dict()
         for filename in glob.glob(join(raw_dir, 'PG%s_raw.txt' % (args.pattern))):
             # The process_books function will fail very rarely, whne
             # a file tagged as UTf-8 is not really UTF-8. We kust
@@ -111,31 +142,46 @@
             # get PG_id
             PG_id = os.path.split(filename)[-1].split("_")[0]
 
-            # get language from metadata
-            # default is english
-            language = "english"
-            # language is a string representing a list of languages codes
-            lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0]
-            if lang_id in langs_dict.keys():
-                language = langs_dict[lang_id]
-
-            # process the book: strip headers, tokenize, count
-            book_process_jobs.append(pool.submit(
-                process_book,
-                path_to_raw_file=filename,
-                text_dir=text_dir,
-                tokens_dir=tokens_dir,
-                counts_dir=counts_dir,
-                language=language,
-                log_file=args.log_file))
-            
+            if PG_id not in done_jobs:
+                # get language from metadata
+                # default is english
+                language = "english"
+                try:
+                    # language is a string representing a list of languages codes
+                    lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0]
+                    if lang_id in langs_dict.keys():
+                        language = langs_dict[lang_id]
+                except KeyError:
+                    if not args.quiet:
+                        msg = "# WARNING: metadata for '%s' not found" % filename
+                        print(msg)
+                        if args.log_file:
+                            with io.open(args.log_file, "a") as f:
+                                f.write(msg + '\n')
+
+                # process the book: strip headers, tokenize, count
+                book_process_jobs[
+                    pool.submit(
+                        process_book,
+                        path_to_raw_file=filename,
+                        text_dir=text_dir,
+                        tokens_dir=tokens_dir,
+                        counts_dir=counts_dir,
+                        overwrite_all=True,
+                        language=language,
+                        log_file=args.log_file,
+                        ignore=args.ignore)
+                ] = PG_id
+        
             pbooks += 1
             if not args.quiet:
                 print("%d book processing jobs started..." % pbooks, end="\r")
         
-        print()
+        print("\n%d book processing jobs created in total" % len(book_process_jobs))
+
         pbooks = 0
         for job in concurrent.futures.as_completed(book_process_jobs):
+            PG_id = book_process_jobs[job]
             if args.log_file:
                 try:
                     log_content = job.result()
@@ -143,18 +189,15 @@
                         f.write(log_content)
                 except UnicodeDecodeError:
                     if not args.quiet:
-                        print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename)
-                except KeyError:
-                    if not args.quiet:
-                        print("# WARNING: metadata for '%s' not found" % filename)
+                        print("# WARNING: cannot process '%s' (encoding not UTF-8)" % PG_id)
                 except LookupError as e:
                     print("Very likely that an NLTK resource needs to be downloaded")
                     raise e
                 except Exception as e:
                     if not args.quiet:
-                        print("# WARNING: cannot process '%s' (unkown error)" % filename)
+                        print("# WARNING: cannot process '%s' (unkown error)" % PG_id)
                         raise e
             pbooks += 1
             if not args.quiet:
                 print("Processed %d books..." % pbooks, end="\r")
-            
+        print("\ndone")

From 1a06f5306c2dd7aa2984ea91604297f5af5b537d Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Thu, 4 Jan 2024 17:05:46 +0000
Subject: [PATCH 12/20] Added in the missing `get_bookshelves()` call in
 get_data.py and a utility function for checking if a file is empty

---
 get_data.py  | 1 +
 src/utils.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/get_data.py b/get_data.py
index 5cdc3f0..11354dc 100644
--- a/get_data.py
+++ b/get_data.py
@@ -183,6 +183,7 @@
     # -----------
     # Get bookshelves and their respective books and titles as dicts
     if 'b' in args.procedures:
+        get_bookshelves()
         BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
         with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
             pickle.dump(BS_dict, fp)
diff --git a/src/utils.py b/src/utils.py
index b8ed09d..6b5a8cd 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -3,6 +3,7 @@
 import shutil
 import subprocess
 import glob
+import io
 from sys import platform
 
 # add support for windows
@@ -162,4 +163,6 @@ def remove_empty_dirs(path: str, quiet: bool=False):
                 if not quiet:
                     print(f"Removed empty directory: {subdir_path}")
 
-
+def check_not_empty(fname: str) -> bool:
+    with io.open(fname, errors="ignore", encoding="utf-8") as f:
+        return bool(f.read().strip())

From cfef209b66aaa32dc685dc28e102c314dc7b787b Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Thu, 4 Jan 2024 17:19:24 +0000
Subject: [PATCH 13/20] Added an option to check if any of the resultant files
 are empty before assuming a book is "done"

---
 .gitignore      |  6 ++++++
 process_data.py | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index df32ff6..639fa2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,3 +117,9 @@ cwRsync*/*
 
 # nltk data directory
 src/nltk_data/**
+
+# Wget temporary directory
+*gutenberg*/
+
+# Jupyter notebooks for processing data
+*.ipynb
\ No newline at end of file
diff --git a/process_data.py b/process_data.py
index 566f909..bdfc75a 100644
--- a/process_data.py
+++ b/process_data.py
@@ -16,7 +16,7 @@
 import re
 
 from src.pipeline import process_book
-from src.utils import get_langs_dict, is_win32
+from src.utils import get_langs_dict, check_not_empty, is_win32
 
 
 if __name__ == '__main__':
@@ -72,6 +72,12 @@
         default=".log",
         type=str)
     
+    # check if existing files are empty
+    parser.add_argument(
+        "-c", "--check_empty",
+        action="store_true",
+        help="Whether to check if existing files are empty")
+    
     # whether to ignore UTF-8 decoding errors
     parser.add_argument(
         "--ignore",
@@ -128,10 +134,36 @@
                     glob.glob('PG%s_tokens.txt' % (args.pattern), root_dir=tokens_dir)}
     exist_counts = {pattern_counts.fullmatch(f) for f in 
                     glob.glob('PG%s_counts.txt' % (args.pattern), root_dir=counts_dir)}
+    
     exist_text = {f.group(1) for f in exist_text if f}
     exist_tokens = {f.group(1) for f in exist_tokens if f}
     exist_counts = {f.group(1) for f in exist_counts if f}
     done_jobs = exist_text & exist_tokens & exist_counts
+    del exist_text, exist_tokens, exist_counts
+
+    if args.check_empty:
+        with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool:
+            tmp0 = [(os.path.join(text_dir, PG_id) + "_text.txt", PG_id) for PG_id in done_jobs]
+            tmp1 = [(os.path.join(tokens_dir, PG_id) + "_tokens.txt", PG_id) for PG_id in done_jobs]
+            tmp2 = [(os.path.join(counts_dir, PG_id) + "_counts.txt", PG_id) for PG_id in done_jobs]
+            validate_jobs0 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp0}
+            validate_jobs1 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp1}
+            validate_jobs2 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp2}
+            validation_results = {PG_id : 0 for PG_id in done_jobs}
+            if not args.quiet:
+                print("%d books to check for completion (3 passes required)" % len(done_jobs))
+            for job_type in [validate_jobs0, validate_jobs1, validate_jobs2]:
+                pbooks = 0
+                for job in concurrent.futures.as_completed(job_type):
+                    if job.result():
+                        validation_results[job_type[job]] += 1
+                    pbooks += 1
+                    if (not args.quiet) and (pbooks % 100 == 0):
+                        print("%6d books checked for completion" % pbooks, end="\r")
+            done_jobs = {PG_id for PG_id in validation_results if validation_results[PG_id] == 3}
+            if not args.quiet:
+                print("%d books seem to be processed but have empty file(s)" %(len(validation_results) - len(done_jobs)))
+            del tmp0, tmp1, tmp2, validate_jobs0, validate_jobs1, validate_jobs2, validation_results
 
     with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool:
         book_process_jobs = dict()

From c183413d6621d4a921814fd3c20f61c716f04166 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 01:07:39 +0000
Subject: [PATCH 14/20] Fixed several bugs in bookshelf-related code: - used
 `shutil.rmtree` instead of `os.rmdir` since latter is only for empty dir and
 added check for existence of dir to remove (win32) - removed `-p` option when
 calling wget to avoid downloading large amount of useless data (original
 code) - filtered out many garbage data (non-book weblinks) in bookshelves
 dicts and removed "PG" prefix for values of index.html in
 bookshelves_ebooks_dict.pkl (original code)

---
 src/bookshelves.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/bookshelves.py b/src/bookshelves.py
index 7ed8b2f..791b31c 100644
--- a/src/bookshelves.py
+++ b/src/bookshelves.py
@@ -13,13 +13,13 @@
 def rm_dir(*args):
     subprocess.call(["rm", "-rf"] + args)
 
-rm_pattern = rm_dir
-
 if is_win32:
-    rm_dir = os.rmdir
+    rm_dir = shutil.rmtree
     def rm_pattern(path):
-        for file in glob.glob(path):
+        for file in glob.glob(path.replace('/', os.path.sep)):
             os.remove(file)
+else:
+    rm_pattern = rm_dir
 
 def get_bookshelves():
     """
@@ -31,8 +31,8 @@ def get_bookshelves():
 
     """
     sp_args = ["wget",
-               "--random-wait", "-r", 
-               "-p", "--no-parent", 
+               "--random-wait", "-r",
+               "--no-parent", 
                "-e", "robots=off", 
                "-U", "mozilla", 
                "https://www.gutenberg.org/ebooks/bookshelf/"
@@ -51,7 +51,8 @@ def get_bookshelves():
         subprocess.call(sp_args, shell=True)
 
     # cleanup
-    rm_dir("www.gutenberg.org")
+    if os.path.exists("www.gutenberg.org"):
+        rm_dir("www.gutenberg.org")
     # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page)
     # we get also other files, copy of the bookshelves but with different ordering
     # remove them
@@ -78,15 +79,17 @@ def parse_bookshelves():
     BS_num_to_category_str_dict = {}
     for path in BS_paths:
         _, bs = os.path.split(path)
+        PG_header = '' if bs == "index.html" else "PG"
         BS_dict[bs] = []
         with open(path, "r", encoding="UTF-8") as foo:
-            dom = lxml.html.fromstring(foo.read())
+            dom = lxml.html.parse(path)
             # select the url in href for all a tags(links)
             for link in dom.xpath('//a/@href'):
                 # links to ebooks that are not searches
                 if link.find("ebooks") > -1 and link.find("search") == -1:
-                    PGid = "PG"+link.split("/")[-1]
-                    BS_dict[bs].append(PGid)
+                    book_id = link.split("/")[-1]
+                    if book_id.isdigit():
+                        BS_dict[bs].append(PG_header + book_id)
             # get title of the category
             title_categories = dom.findall('.//title') # './/title' finds recursively the element with tag 'title'
             # check if there is only one title in the metadata of the category

From 93208e6c421ae92214b5031889f5baf6ca91d331 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 01:10:57 +0000
Subject: [PATCH 15/20] Further extended the procedures option to allow for
 parsing/saving of bookshelves info without running time-consuming Wget

---
 get_data.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/get_data.py b/get_data.py
index 11354dc..c39810e 100644
--- a/get_data.py
+++ b/get_data.py
@@ -92,8 +92,9 @@
         - find [d]uplicates
         - hard [l]ink from mirror to raw
         - get [m]etadata
-        - get [b]ookshelf information''',
-        default='pdlmb',
+        - get [b]ookshelf information
+        - [s]tore bookshelf information''',
+        default='pdlmbs',
         type=str)
 
     # create the parser
@@ -184,6 +185,8 @@
     # Get bookshelves and their respective books and titles as dicts
     if 'b' in args.procedures:
         get_bookshelves()
+    
+    if 's' in args.procedures:
         BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
         with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
             pickle.dump(BS_dict, fp)

From 8c6b530f0c464be1d5ce1ec9ad3a35c664e48c94 Mon Sep 17 00:00:00 2001
From: Hugo^3 <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 01:43:45 +0000
Subject: [PATCH 16/20] Corrected help message for procedures option

---
 get_data.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/get_data.py b/get_data.py
index c39810e..0ebdf8d 100644
--- a/get_data.py
+++ b/get_data.py
@@ -87,13 +87,13 @@
     # rsync command
     parser.add_argument(
         "--procedures",
-        help='''Procedures to go through, defaults to \"pdlmb\": 
-        - [p]ull mirror files
-        - find [d]uplicates
-        - hard [l]ink from mirror to raw
-        - get [m]etadata
-        - get [b]ookshelf information
-        - [s]tore bookshelf information''',
+        help='''Procedures to go through, defaults to \"pdlmbs\":
+        [p]ull mirror files;
+        find [d]uplicates;
+        hard [l]ink from mirror to raw;
+        get [m]etadata;
+        get [b]ookshelf information;
+        [s]tore bookshelf information''',
         default='pdlmbs',
         type=str)
 

From b81b321574bf3091e6c43d4d7b3399670273325b Mon Sep 17 00:00:00 2001
From: Hugo <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 09:47:18 +0800
Subject: [PATCH 17/20] Update README.md

---
 README.md | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7bd926f..43a213c 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,13 @@ SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that wer
 
 For **most other use cases**, however, you probably want the latest, most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset.
 
+## Changes in this fork
+- Windows support (still need to install `wget` and `cwRsync` (cwRsync tested with 5.4.1)
+- Fixed stuffs in original code:
+  - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.)
+  - bugs & typos
+- Parallelised text processing
+- Additional arguments for customization
 
 ## Installation
 :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/).  
@@ -43,7 +50,7 @@ python get_data.py
 This will download a copy of all UTF-8 books in PG and will create a csv file with metadata (e.g. author, title, year, ...).
 
 Notice that if you already have some of the data, the program will only download those you are missing (we use `rsync` for this). It is hence easy to update the dataset periodically to keep it up-to-date by just running `get_data.py`.
-
+> For Windows users, see the [**Usage**](#usage) section
 
 ## Processing the data
 To process all the data in the `raw/` directory, run
@@ -51,6 +58,75 @@ To process all the data in the `raw/` directory, run
 python process_data.py
 ```
 This will fill in the `text/`, `tokens/` and `counts/` folders.
+> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see the [**Usage**](#usage) section
 
+## Usage
+**Recommended usage for `get_data.py` (Windows user):** 
+```bash
+python get_data.py --rsync "cwRsync_5.4.1/rsync"
+```
+(replace `cwRsync_5.4.1/rsync` with path to your rsync binary, `.exe` is not needed)
+
+**Recommended usage for `process_data.py`:**
+```bash
+python process_data.py --ignore
+```
 
+**How to use `get_data.py` with customisation options:**
+```
+python get_data.py --help
+usage: Update local PG repository.
+
+This script will download all books currently not in your
+local copy of PG and get the latest version of the metadata.
+
+       [-h] [-m MIRROR] [-r RAW] [-M METADATA] [-p PATTERN] [-k] [-owr] [-q] [-c] [--rsync RSYNC] [--procedures PROCEDURES]
+
+options:
+  -h, --help            show this help message and exit
+  -m MIRROR, --mirror MIRROR
+                        Path to the mirror folder that will be updated via rsync.
+  -r RAW, --raw RAW     Path to the raw folder.
+  -M METADATA, --metadata METADATA
+                        Path to the metadata folder.
+  -p PATTERN, --pattern PATTERN
+                        Patterns to get only a subset of books.
+  -k, --keep_rdf        If there is an RDF file in metadata dir, do not overwrite it.
+  -owr, --overwrite_raw
+                        Overwrite files in raw.
+  -q, --quiet           Quiet mode, do not print info, warnings, etc
+  -c, --clean           Clean the mirror directory to remove any empty folders
+  --rsync RSYNC         Specify an alternative rsync command
+  --procedures PROCEDURES
+                        Procedures to go through, defaults to "pdlmbs": [p]ull mirror files; find [d]uplicates; hard [l]ink from mirror to raw;   
+                        get [m]etadata; get [b]ookshelf information; [s]tore bookshelf information
+```
 
+**How to use `process_data.py` with customisation options:**
+```
+python process_data.py --help
+[nltk_data] Downloading package punkt to src/nltk_data...
+[nltk_data]   Package punkt is already up-to-date!
+usage: Processing raw texts from Project Gutenberg: i) removing headers,ii) tokenizing, and iii) counting words.
+       [-h] [-r RAW] [-ote OUTPUT_TEXT] [-oto OUTPUT_TOKENS] [-oco OUTPUT_COUNTS] [-p PATTERN] [-q] [-l LOG_FILE] [-c] [--ignore]
+       [--pool {process,thread}]
+
+options:
+  -h, --help            show this help message and exit
+  -r RAW, --raw RAW     Path to the raw-folder
+  -ote OUTPUT_TEXT, --output_text OUTPUT_TEXT
+                        Path to text-output (text_dir)
+  -oto OUTPUT_TOKENS, --output_tokens OUTPUT_TOKENS
+                        Path to tokens-output (tokens_dir)
+  -oco OUTPUT_COUNTS, --output_counts OUTPUT_COUNTS
+                        Path to counts-output (counts_dir)
+  -p PATTERN, --pattern PATTERN
+                        Pattern to specify a subset of books
+  -q, --quiet           Quiet mode, do not print info, warnings, etc
+  -l LOG_FILE, --log_file LOG_FILE
+                        Path to log file
+  -c, --check_empty     Whether to check if existing files are empty
+  --ignore              Whether to ignore UTF-8 decoding errors
+  --pool {process,thread}
+                        Whether to use multi-processing or multi-threading
+```

From 0349d282989dfe5aa4c552616ac18dd634920db5 Mon Sep 17 00:00:00 2001
From: Hugo <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 09:51:59 +0800
Subject: [PATCH 18/20] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 43a213c..e8d8a17 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,8 @@ For **most other use cases**, however, you probably want the latest, most recent
   - bugs & typos
 - Parallelised text processing
 - Additional arguments for customization
+> **Note:**
+> this fork has only been tested on Windows (yet), but should work on other platforms unless the original code doesn't work in the first place?
 
 ## Installation
 :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/).  
@@ -105,8 +107,6 @@ options:
 **How to use `process_data.py` with customisation options:**
 ```
 python process_data.py --help
-[nltk_data] Downloading package punkt to src/nltk_data...
-[nltk_data]   Package punkt is already up-to-date!
 usage: Processing raw texts from Project Gutenberg: i) removing headers,ii) tokenizing, and iii) counting words.
        [-h] [-r RAW] [-ote OUTPUT_TEXT] [-oto OUTPUT_TOKENS] [-oco OUTPUT_COUNTS] [-p PATTERN] [-q] [-l LOG_FILE] [-c] [--ignore]
        [--pool {process,thread}]

From a96bf1141ca804fa02815f8d1e8d913671729c8f Mon Sep 17 00:00:00 2001
From: Hugo <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 09:54:22 +0800
Subject: [PATCH 19/20] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e8d8a17..a24be6e 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ For **most other use cases**, however, you probably want the latest, most recent
   - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.)
   - bugs & typos
 - Parallelised text processing
-- Additional arguments for customization
+- Additional arguments for customisation (see [**Usage**](#usage) section)
 > **Note:**
 > this fork has only been tested on Windows (yet), but should work on other platforms unless the original code doesn't work in the first place?
 
@@ -52,7 +52,7 @@ python get_data.py
 This will download a copy of all UTF-8 books in PG and will create a csv file with metadata (e.g. author, title, year, ...).
 
 Notice that if you already have some of the data, the program will only download those you are missing (we use `rsync` for this). It is hence easy to update the dataset periodically to keep it up-to-date by just running `get_data.py`.
-> For Windows users, see the [**Usage**](#usage) section
+> For Windows users, see [**Usage**](#usage) section
 
 ## Processing the data
 To process all the data in the `raw/` directory, run
@@ -60,7 +60,7 @@ To process all the data in the `raw/` directory, run
 python process_data.py
 ```
 This will fill in the `text/`, `tokens/` and `counts/` folders.
-> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see the [**Usage**](#usage) section
+> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see [**Usage**](#usage) section
 
 ## Usage
 **Recommended usage for `get_data.py` (Windows user):** 

From 0764543b615aeba59af6aa6e9e4dc7eb79b46804 Mon Sep 17 00:00:00 2001
From: Hugo <pondypondo@outlook.com>
Date: Fri, 5 Jan 2024 10:01:48 +0800
Subject: [PATCH 20/20] Update README.md

---
 README.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a24be6e..ef32add 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,8 @@ For **most other use cases**, however, you probably want the latest, most recent
 
 ## Changes in this fork
 - Windows support (still need to install `wget` and `cwRsync` (cwRsync tested with 5.4.1)
-- Fixed stuffs in original code:
+- Patched issues in the original code:
+  - unwanted garbage in bookshelves info (probably due to Project Gutenberg website updating)
   - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.)
   - bugs & typos
 - Parallelised text processing
@@ -24,6 +25,13 @@ For **most other use cases**, however, you probably want the latest, most recent
 > **Note:**
 > this fork has only been tested on Windows (yet), but should work on other platforms unless the original code doesn't work in the first place?
 
+## Todo
+- Better tokenisation rules?
+  - Chinese books are all empty after tokenisation -> use jieba, probably?
+  - Only tokens that return `True` for `str.isalpha()` are kept currently
+- Faster method for getting bookshelves info?
+
+
 ## Installation
 :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/).