diff --git a/.gitignore b/.gitignore
index 5f5032e..639fa2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,4 +108,18 @@ ENV/
 .mypy_cache/
 
 # VScode
-.vscode/
\ No newline at end of file
+.vscode/
+
+# Windows dependencies / batch files
+cwRsync*/*
+*.exe
+*.bat
+
+# nltk data directory
+src/nltk_data/**
+
+# Wget temporary directory
+*gutenberg*/
+
+# Jupyter notebooks for processing data
+*.ipynb
\ No newline at end of file
diff --git a/README.md b/README.md
index 7bd926f..ef32add 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,23 @@ SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that wer
 
 For **most other use cases**, however, you probably want the latest, most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset.
 
+## Changes in this fork
+- Windows support (you still need to install `wget` and `cwRsync`; cwRsync was tested with 5.4.1)
+- Patches to the original code:
+  - unwanted garbage in bookshelves info (probably due to Project Gutenberg website updating)
+  - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.)
+  - bugs & typos
+- Parallelised text processing
+- Additional arguments for customisation (see [**Usage**](#usage) section)
+> **Note:**
+> This fork has only been tested on Windows so far, but it should work on other platforms wherever the original code works.
+
+## Todo
+- Better tokenisation rules?
+  - Chinese books are all empty after tokenisation -> use jieba, probably?
+  - Only tokens that return `True` for `str.isalpha()` are kept currently
+- Faster method for getting bookshelves info?
+
 
 ## Installation
 :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/).  
@@ -43,7 +60,7 @@ python get_data.py
 This will download a copy of all UTF-8 books in PG and will create a csv file with metadata (e.g. author, title, year, ...).
 
 Notice that if you already have some of the data, the program will only download those you are missing (we use `rsync` for this). It is hence easy to update the dataset periodically to keep it up-to-date by just running `get_data.py`.
-
+> For Windows users, see [**Usage**](#usage) section
 
 ## Processing the data
 To process all the data in the `raw/` directory, run
@@ -51,6 +68,73 @@ To process all the data in the `raw/` directory, run
 python process_data.py
 ```
 This will fill in the `text/`, `tokens/` and `counts/` folders.
+> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see [**Usage**](#usage) section
 
+## Usage
+**Recommended usage for `get_data.py` (Windows user):** 
+```bash
+python get_data.py --rsync "cwRsync_5.4.1/rsync"
+```
+(replace `cwRsync_5.4.1/rsync` with path to your rsync binary, `.exe` is not needed)
 
+**Recommended usage for `process_data.py`:**
+```bash
+python process_data.py --ignore
+```
 
+**How to use `get_data.py` with customisation options:**
+```
+python get_data.py --help
+usage: Update local PG repository.
+
+This script will download all books currently not in your
+local copy of PG and get the latest version of the metadata.
+
+       [-h] [-m MIRROR] [-r RAW] [-M METADATA] [-p PATTERN] [-k] [-owr] [-q] [-c] [--rsync RSYNC] [--procedures PROCEDURES]
+
+options:
+  -h, --help            show this help message and exit
+  -m MIRROR, --mirror MIRROR
+                        Path to the mirror folder that will be updated via rsync.
+  -r RAW, --raw RAW     Path to the raw folder.
+  -M METADATA, --metadata METADATA
+                        Path to the metadata folder.
+  -p PATTERN, --pattern PATTERN
+                        Patterns to get only a subset of books.
+  -k, --keep_rdf        If there is an RDF file in metadata dir, do not overwrite it.
+  -owr, --overwrite_raw
+                        Overwrite files in raw.
+  -q, --quiet           Quiet mode, do not print info, warnings, etc
+  -c, --clean           Clean the mirror directory to remove any empty folders
+  --rsync RSYNC         Specify an alternative rsync command
+  --procedures PROCEDURES
+                        Procedures to go through, defaults to "pdlmbs": [p]ull mirror files; find [d]uplicates; hard [l]ink from mirror to raw;   
+                        get [m]etadata; get [b]ookshelf information; [s]tore bookshelf information
+```
+
+**How to use `process_data.py` with customisation options:**
+```
+python process_data.py --help
+usage: Processing raw texts from Project Gutenberg: i) removing headers,ii) tokenizing, and iii) counting words.
+       [-h] [-r RAW] [-ote OUTPUT_TEXT] [-oto OUTPUT_TOKENS] [-oco OUTPUT_COUNTS] [-p PATTERN] [-q] [-l LOG_FILE] [-c] [--ignore]
+       [--pool {process,thread}]
+
+options:
+  -h, --help            show this help message and exit
+  -r RAW, --raw RAW     Path to the raw-folder
+  -ote OUTPUT_TEXT, --output_text OUTPUT_TEXT
+                        Path to text-output (text_dir)
+  -oto OUTPUT_TOKENS, --output_tokens OUTPUT_TOKENS
+                        Path to tokens-output (tokens_dir)
+  -oco OUTPUT_COUNTS, --output_counts OUTPUT_COUNTS
+                        Path to counts-output (counts_dir)
+  -p PATTERN, --pattern PATTERN
+                        Pattern to specify a subset of books
+  -q, --quiet           Quiet mode, do not print info, warnings, etc
+  -l LOG_FILE, --log_file LOG_FILE
+                        Path to log file
+  -c, --check_empty     Whether to check if existing files are empty
+  --ignore              Whether to ignore UTF-8 decoding errors
+  --pool {process,thread}
+                        Whether to use multi-processing or multi-threading
+```
diff --git a/get_data.py b/get_data.py
index 5012e68..0ebdf8d 100644
--- a/get_data.py
+++ b/get_data.py
@@ -5,7 +5,7 @@
 M. Gerlach & F. Font-Clos
 
 """
-from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror
+from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror, remove_empty_dirs, is_win32
 from src.metadataparser import make_df_metadata
 from src.bookshelves import get_bookshelves
 from src.bookshelves import parse_bookshelves
@@ -22,6 +22,7 @@
         "This script will download all books currently not in your\n"
         "local copy of PG and get the latest version of the metadata.\n"
         )
+    
     # mirror dir
     parser.add_argument(
         "-m", "--mirror",
@@ -68,16 +69,50 @@
         action="store_true",
         help="Quiet mode, do not print info, warnings, etc"
         )
+    
+    # clean argument, to remove empty folders from the mirror directory
+    parser.add_argument(
+        "-c", "--clean",
+        action="store_true",
+        help="Clean the mirror directory to remove any empty folders"
+        )
+    
+    # rsync command
+    parser.add_argument(
+        "--rsync",
+        help="Specify an alternative rsync command",
+        default='rsync',
+        type=str)
+    
+    # procedures to run (one letter per step)
+    parser.add_argument(
+        "--procedures",
+        help='''Procedures to go through, defaults to \"pdlmbs\":
+        [p]ull mirror files;
+        find [d]uplicates;
+        hard [l]ink from mirror to raw;
+        get [m]etadata;
+        get [b]ookshelf information;
+        [s]tore bookshelf information''',
+        default='pdlmbs',
+        type=str)
 
     # create the parser
     args = parser.parse_args()
+    mirror_dir, raw_dir, metadata_dir = args.mirror, args.raw, args.metadata
+    
+    if is_win32:
+        print("Windows detected, please make sure wget is installed and added to PATH")
+        mirror_dir = mirror_dir.replace('/', '\\')
+        raw_dir = raw_dir.replace('/', '\\')
+        metadata_dir = metadata_dir.replace('/', '\\')
 
     # check that all dirs exist
-    if not os.path.isdir(args.mirror):
+    if not os.path.isdir(mirror_dir):
         raise ValueError("The specified mirror directory does not exist.")
-    if not os.path.isdir(args.raw):
+    if not os.path.isdir(raw_dir):
         raise ValueError("The specified raw directory does not exist.")
-    if not os.path.isdir(args.metadata):
+    if not os.path.isdir(metadata_dir):
         raise ValueError("The specified metadata directory does not exist.")
 
     # Update the .mirror directory via rsync
@@ -99,49 +134,61 @@
     # + 12345 -   0   .  t x                 t 
     #---------------------------------------------
     #        [.-][t0][x.]t[x.]    *         [t8]
-    sp_args = ["rsync", "-am%s" % vstring,
-               "--include", "*/",
-               "--include", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern,
-               "--exclude", "*",
-               "aleph.gutenberg.org::gutenberg", args.mirror
-               ]
-    subprocess.call(sp_args)
+    includes = ["*/", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern]
+    excludes = ["*"]
+    sp_args = ' '.join([args.rsync, "-am%s" % vstring] + ["--include=\"%s\"" % i for i in includes] + \
+        ["--exclude=\"%s\"" % i for i in excludes] + ["aleph.gutenberg.org::gutenberg", mirror_dir])
+    
+    # If specified, remove any empty directory that might be caused by bugs or wrong patterns in rsync
+    if args.clean:
+        remove_empty_dirs(mirror_dir, args.quiet)
+
+    # Subprocess call (default arguments):
+    # rsync -amv --include="*/" --include="[p123456789][g0123456789]*[.-][t0][x.]t[x.]*[t8]" --exclude="*" aleph.gutenberg.org::gutenberg data/.mirror/
+    if 'p' in args.procedures:
+        subprocess.call(sp_args, shell=not is_win32)  # sp_args is one command string; on POSIX only a shell splits it into arguments
 
     # Get rid of duplicates
     # ---------------------
     # A very small portion of books are stored more than
     # once in PG's site. We keep the newest one, see
     # erase_duplicates_in_mirror docstring.
-    dups_list = list_duplicates_in_mirror(mirror_dir=args.mirror)
+    dups_list = list_duplicates_in_mirror(mirror_dir=mirror_dir) if 'd' in args.procedures else []
 
     # Populate raw from mirror
     # ------------------------
     # We populate 'raw_dir' hardlinking to
     # the hidden 'mirror_dir'. Names are standarized
     # into PG12345_raw.txt form.
-    populate_raw_from_mirror(
-        mirror_dir=args.mirror,
-        raw_dir=args.raw,
-        overwrite=args.overwrite_raw,
-        dups_list=dups_list,
-        quiet=args.quiet
+    if 'l' in args.procedures:
+        populate_raw_from_mirror(
+            mirror_dir=mirror_dir,
+            raw_dir=raw_dir,
+            overwrite=args.overwrite_raw,
+            dups_list=dups_list,
+            quiet=args.quiet
         )
 
     # Update metadata
     # ---------------
     # By default, update the whole metadata csv
     # file each time new data is downloaded.
-    make_df_metadata(
-        path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'),
-        path_out=os.path.join(args.metadata, 'metadata.csv'),
-        update=args.keep_rdf
+    if 'm' in args.procedures:
+        make_df_metadata(
+            path_xml=os.path.join(metadata_dir, 'rdf-files.tar.bz2'),
+            path_out=os.path.join(metadata_dir, 'metadata.csv'),
+            update=args.keep_rdf
         )
 
     # Bookshelves
     # -----------
     # Get bookshelves and their respective books and titles as dicts
-    BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
-    with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
-        pickle.dump(BS_dict, fp)
-    with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp:
-        pickle.dump(BS_num_to_category_str_dict, fp)
\ No newline at end of file
+    if 'b' in args.procedures:
+        get_bookshelves()
+    
+    if 's' in args.procedures:
+        BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
+        with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
+            pickle.dump(BS_dict, fp)
+        with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp:
+            pickle.dump(BS_num_to_category_str_dict, fp)
\ No newline at end of file
diff --git a/process_data.py b/process_data.py
index e21b92f..bdfc75a 100644
--- a/process_data.py
+++ b/process_data.py
@@ -11,12 +11,18 @@
 import glob
 import ast
 import pandas as pd
+import concurrent.futures
+import io
+import re
 
 from src.pipeline import process_book
-from src.utils import get_langs_dict
+from src.utils import get_langs_dict, check_not_empty, is_win32
 
 
 if __name__ == '__main__':
+    from src.tokenizer import nltk_dir
+    import nltk
+    nltk.download("punkt", nltk_dir) # avoid lookup error
 
     parser = argparse.ArgumentParser(
         "Processing raw texts from Project Gutenberg:"
@@ -48,7 +54,7 @@
     # pattern to specify subset of books
     parser.add_argument(
         "-p", "--pattern",
-        help="Patttern to specify a subset of books",
+        help="Pattern to specify a subset of books",
         default='*',
         type=str)
 
@@ -65,20 +71,48 @@
         help="Path to log file",
         default=".log",
         type=str)
+    
+    # check if existing files are empty
+    parser.add_argument(
+        "-c", "--check_empty",
+        action="store_true",
+        help="Whether to check if existing files are empty")
+    
+    # whether to ignore UTF-8 decoding errors
+    parser.add_argument(
+        "--ignore",
+        action="store_true",
+        help="Whether to ignore UTF-8 decoding errors")
+    
+    # multi-threading/processing choice
+    parser.add_argument(
+        "--pool",
+        help="Whether to use multi-processing or multi-threading",
+        default="process",
+        choices=["process", "thread"],
+        type=str)
 
     # add arguments to parser
     args = parser.parse_args()
+    raw_dir, text_dir, tokens_dir, counts_dir = args.raw, args.output_text, args.output_tokens, args.output_counts
+    
+    if is_win32:
+        print("Windows detected")
+        raw_dir = raw_dir.replace('/', '\\')
+        text_dir = text_dir.replace('/', '\\')
+        tokens_dir = tokens_dir.replace('/', '\\')
+        counts_dir = counts_dir.replace('/', '\\')
 
     # check whether the out-put directories exist
-    if os.path.isdir(args.output_text) is False:
+    if os.path.isdir(text_dir) is False:
         raise ValueError("The directory for output of texts '%s' "
-                         "does not exist" % (args.output_text))
-    if os.path.isdir(args.output_tokens) is False:
+                         "does not exist" % (text_dir))
+    if os.path.isdir(tokens_dir) is False:
         raise ValueError("The directory for output of tokens '%s' "
-                         "does not exist" % (args.output_tokens))
-    if os.path.isdir(args.output_counts) is False:
+                         "does not exist" % (tokens_dir))
+    if os.path.isdir(counts_dir) is False:
         raise ValueError("The directory for output of counts '%s' "
-                         "does not exist" % (args.output_counts))
+                         "does not exist" % (counts_dir))
 
     # load metadata
     metadata = pd.read_csv("metadata/metadata.csv").set_index("id")
@@ -88,40 +122,114 @@
 
     # loop over all books in the raw-folder
     pbooks = 0
-    for filename in glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))):
-        # The process_books function will fail very rarely, whne
-        # a file tagged as UTf-8 is not really UTF-8. We kust
-        # skip those books.
-        try:
-            # get PG_id
-            PG_id = filename.split("/")[-1].split("_")[0]
-
-            # get language from metadata
-            # default is english
-            language = "english"
-            # language is a string representing a list of languages codes
-            lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0]
-            if lang_id in langs_dict.keys():
-                language = langs_dict[lang_id]
-
-            # process the book: strip headers, tokenize, count
-            process_book(
-                path_to_raw_file=filename,
-                text_dir=args.output_text,
-                tokens_dir=args.output_tokens,
-                counts_dir=args.output_counts,
-                language=language,
-                log_file=args.log_file
-            )
-            pbooks += 1
+
+    # find out which jobs were already done
+    re_pattern = args.pattern.replace('*', '.*') # wild card roughly equals .* in regex
+    pattern_text = re.compile('(PG%s)_text.txt' % (re_pattern))
+    pattern_tokens = re.compile('(PG%s)_tokens.txt' % (re_pattern))
+    pattern_counts = re.compile('(PG%s)_counts.txt' % (re_pattern))
+    exist_text = {pattern_text.fullmatch(f) for f in 
+                  glob.glob('PG%s_text.txt' % (args.pattern), root_dir=text_dir)}
+    exist_tokens = {pattern_tokens.fullmatch(f) for f in 
+                    glob.glob('PG%s_tokens.txt' % (args.pattern), root_dir=tokens_dir)}
+    exist_counts = {pattern_counts.fullmatch(f) for f in 
+                    glob.glob('PG%s_counts.txt' % (args.pattern), root_dir=counts_dir)}
+    
+    exist_text = {f.group(1) for f in exist_text if f}
+    exist_tokens = {f.group(1) for f in exist_tokens if f}
+    exist_counts = {f.group(1) for f in exist_counts if f}
+    done_jobs = exist_text & exist_tokens & exist_counts
+    del exist_text, exist_tokens, exist_counts
+
+    if args.check_empty:
+        with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool:
+            tmp0 = [(os.path.join(text_dir, PG_id) + "_text.txt", PG_id) for PG_id in done_jobs]
+            tmp1 = [(os.path.join(tokens_dir, PG_id) + "_tokens.txt", PG_id) for PG_id in done_jobs]
+            tmp2 = [(os.path.join(counts_dir, PG_id) + "_counts.txt", PG_id) for PG_id in done_jobs]
+            validate_jobs0 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp0}
+            validate_jobs1 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp1}
+            validate_jobs2 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp2}
+            validation_results = {PG_id : 0 for PG_id in done_jobs}
             if not args.quiet:
-                print("Processed %d books..." % pbooks, end="\r")
-        except UnicodeDecodeError:
+                print("%d books to check for completion (3 passes required)" % len(done_jobs))
+            for job_type in [validate_jobs0, validate_jobs1, validate_jobs2]:
+                pbooks = 0
+                for job in concurrent.futures.as_completed(job_type):
+                    if job.result():
+                        validation_results[job_type[job]] += 1
+                    pbooks += 1
+                    if (not args.quiet) and (pbooks % 100 == 0):
+                        print("%6d books checked for completion" % pbooks, end="\r")
+            done_jobs = {PG_id for PG_id in validation_results if validation_results[PG_id] == 3}
             if not args.quiet:
-                print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename)
-        except KeyError:
+                print("%d books seem to be processed but have empty file(s)" %(len(validation_results) - len(done_jobs)))
+            del tmp0, tmp1, tmp2, validate_jobs0, validate_jobs1, validate_jobs2, validation_results
+
+    with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool:
+        book_process_jobs = dict()
+        for filename in glob.glob(join(raw_dir, 'PG%s_raw.txt' % (args.pattern))):
+            # The process_book function will fail very rarely, when
+            # a file tagged as UTF-8 is not really UTF-8. We just
+            # skip those books.
+            # get PG_id
+            PG_id = os.path.split(filename)[-1].split("_")[0]
+
+            if PG_id not in done_jobs:
+                # get language from metadata
+                # default is english
+                language = "english"
+                try:
+                    # language is a string representing a list of languages codes
+                    lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0]
+                    if lang_id in langs_dict.keys():
+                        language = langs_dict[lang_id]
+                except KeyError:
+                    if not args.quiet:
+                        msg = "# WARNING: metadata for '%s' not found" % filename
+                        print(msg)
+                        if args.log_file:
+                            with io.open(args.log_file, "a") as f:
+                                f.write(msg + '\n')
+
+                # process the book: strip headers, tokenize, count
+                book_process_jobs[
+                    pool.submit(
+                        process_book,
+                        path_to_raw_file=filename,
+                        text_dir=text_dir,
+                        tokens_dir=tokens_dir,
+                        counts_dir=counts_dir,
+                        overwrite_all=True,
+                        language=language,
+                        log_file=args.log_file,
+                        ignore=args.ignore)
+                ] = PG_id
+        
+            pbooks += 1
             if not args.quiet:
-                print("# WARNING: metadata for '%s' not found" % filename)
-        except Exception as e:
+                print("%d book processing jobs started..." % pbooks, end="\r")
+        
+        print("\n%d book processing jobs created in total" % len(book_process_jobs))
+
+        pbooks = 0
+        for job in concurrent.futures.as_completed(book_process_jobs):
+            PG_id = book_process_jobs[job]
+            if args.log_file:
+                try:
+                    log_content = job.result()
+                    with io.open(args.log_file, "a") as f:
+                        f.write(log_content)
+                except UnicodeDecodeError:
+                    if not args.quiet:
+                        print("# WARNING: cannot process '%s' (encoding not UTF-8)" % PG_id)
+                except LookupError as e:
+                    print("Very likely that an NLTK resource needs to be downloaded")
+                    raise e
+                except Exception as e:
+                    if not args.quiet:
+                        print("# WARNING: cannot process '%s' (unkown error)" % PG_id)
+                        raise e
+            pbooks += 1
             if not args.quiet:
-                print("# WARNING: cannot process '%s' (unkown error)" % filename)
+                print("Processed %d books..." % pbooks, end="\r")
+        print("\ndone")
diff --git a/src/bookshelves.py b/src/bookshelves.py
index 62e31aa..791b31c 100644
--- a/src/bookshelves.py
+++ b/src/bookshelves.py
@@ -7,7 +7,19 @@
 import pandas as pd
 import lxml.html
 import subprocess
+import shutil
+from .utils import is_win32
 
+def rm_dir(*args):
+    subprocess.call(["rm", "-rf", *args])  # unpack args: list + tuple raised TypeError
+
+if is_win32:
+    rm_dir = shutil.rmtree
+
+# "rm" started without a shell never expands wildcards, so expand them with glob on every platform
+def rm_pattern(path):
+    for file in glob.glob(path.replace('/', os.path.sep)):
+        os.remove(file)
 
 def get_bookshelves():
     """
@@ -19,8 +31,8 @@ def get_bookshelves():
 
     """
     sp_args = ["wget",
-               "--random-wait", "-r", 
-               "-p", "--no-parent", 
+               "--random-wait", "-r",
+               "--no-parent", 
                "-e", "robots=off", 
                "-U", "mozilla", 
                "https://www.gutenberg.org/ebooks/bookshelf/"
@@ -28,21 +40,25 @@ def get_bookshelves():
     subprocess.call(sp_args)
 
     # move it to metadata dir
-    sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/"
-    subprocess.call(sp_args, shell=True)
+    if is_win32:
+        dst = "metadata/bookshelves_html/"
+        if not os.path.exists(dst):
+            os.mkdir(dst)
+        for src_path in glob.glob("www.gutenberg.org/ebooks/bookshelf/*"):
+            shutil.move(src_path, dst)
+    else:
+        sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/"
+        subprocess.call(sp_args, shell=True)
 
     # cleanup
-    sp_args = ["rm", "-rf", "www.gutenberg.org"]
-    subprocess.call(sp_args)
+    if os.path.exists("www.gutenberg.org"):
+        rm_dir("www.gutenberg.org")
     # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page)
     # we get also other files, copy of the bookshelves but with different ordering
     # remove them
-    sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"]
-    subprocess.call(sp_args)
-    sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"]
-    subprocess.call(sp_args)
-    sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"]
-    subprocess.call(sp_args)
+    rm_pattern("metadata/bookshelves_html/*.opds*")
+    rm_pattern("metadata/bookshelves_html/*?sort*")
+    rm_pattern("metadata/bookshelves_html/*?start*")
     return None
 
 def parse_bookshelves():
@@ -57,21 +73,23 @@ def parse_bookshelves():
     """
     # parse the data
     BS_paths = glob.glob("metadata/bookshelves_html/*")
-    BS = [path.split("/")[-1] for path in BS_paths]
+    # BS = [os.path.split(path)[-1] for path in BS_paths]
 
     BS_dict = {}
     BS_num_to_category_str_dict = {}
     for path in BS_paths:
-        bs = path.split("/")[-1]
+        _, bs = os.path.split(path)
+        PG_header = '' if bs == "index.html" else "PG"
         BS_dict[bs] = []
         with open(path, "r", encoding="UTF-8") as foo:
-            dom = lxml.html.fromstring(foo.read())
+            dom = lxml.html.parse(path)
             # select the url in href for all a tags(links)
             for link in dom.xpath('//a/@href'):
                 # links to ebooks that are not searches
                 if link.find("ebooks") > -1 and link.find("search") == -1:
-                    PGid = "PG"+link.split("/")[-1]
-                    BS_dict[bs].append(PGid)
+                    book_id = link.split("/")[-1]
+                    if book_id.isdigit():
+                        BS_dict[bs].append(PG_header + book_id)
             # get title of the category
             title_categories = dom.findall('.//title') # './/title' finds recursively the element with tag 'title'
             # check if there is only one title in the metadata of the category
diff --git a/src/cleanup.py b/src/cleanup.py
index 69967f6..90cb232 100644
--- a/src/cleanup.py
+++ b/src/cleanup.py
@@ -16,7 +16,7 @@ def cleanup(path, text_dir):
         Path to the PG****_raw.txt file
 
     """
-    PG_number = path.split("/")[-1].split("_")[0][2:]
+    PG_number = os.path.split(path)[-1].split("_")[0][2:]
     with io.open(path) as f:
         text = f.read()
 
diff --git a/src/pipeline.py b/src/pipeline.py
index 5e89c56..ccc86aa 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -14,7 +14,8 @@ def process_book(
 	cleanup_f=strip_headers,
     overwrite_all=False,
     language="english",
-    log_file=""
+    log_file="",
+    ignore=False
 	):
     """
     Process a book, from raw data to counts.
@@ -39,6 +40,9 @@ def process_book(
     ----------
     overwrite_all : bool
         If set to True, everything is processed regargless of existing files.
+    ignore : bool
+        If set to True, ignores UTF-8 decoding errors for "technically UTF-8" codecs
+        such as Windows-1252, enabling this shouldn't lead to the loss of any token
     """
     if text_dir is None:
         raise ValueError("You must specify a path to save the text files.")
@@ -53,14 +57,15 @@ def process_book(
         raise ValueError("You must specify a path to the raw file to process.")
    
     # get PG number
-    PG_number = path_to_raw_file.split("/")[-1].split("_")[0][2:]
+    PG_number = os.path.split(path_to_raw_file)[-1].split("_")[0][2:]
 
     if overwrite_all or\
         (not os.path.isfile(os.path.join(text_dir,"PG%s_text.txt"%PG_number))) or \
         (not os.path.isfile(os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number))) or \
         (not os.path.isfile(os.path.join(counts_dir,"PG%s_counts.txt"%PG_number))):
         # read raw file
-        with io.open(path_to_raw_file, encoding="UTF-8") as f:
+        with io.open(path_to_raw_file, encoding="UTF-8", 
+                     errors="ignore" if ignore else "strict") as f:
             text = f.read()
 
         # clean it up
@@ -93,6 +98,5 @@ def process_book(
             clean_nl = clean.count("\n")
             L = len(tokens)
             V = len(counts)
-            with io.open(log_file, "a") as f:
-               f.write("PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n")
+            return "PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n"
                 
diff --git a/src/tokenizer.py b/src/tokenizer.py
index b25f21e..873fb9c 100644
--- a/src/tokenizer.py
+++ b/src/tokenizer.py
@@ -3,9 +3,10 @@
    Call tokenize and pass a text (i.e. as a string).
    You will get a list of tokens
 """
+nltk_dir = "src/nltk_data"
 
 import nltk
-nltk.data.path=["src/nltk_data"]
+nltk.data.path=[nltk_dir]
 
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.tokenize import sent_tokenize
@@ -42,7 +43,7 @@ def tokenize_text(text, language="english"):
 
 def filter_tokens(list_tokens):
     '''Remove un-wanted tokens from list of tokens
-    We only keep words that return TRUE for string.isaplha()
+    We only keep words that return TRUE for string.isalpha()
     We lowercase every token with string.lower()
     '''
     list_tokens_filter = [h.lower() for h in list_tokens if h.isalpha()]
diff --git a/src/utils.py b/src/utils.py
index ceea2ea..6b5a8cd 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -3,6 +3,11 @@
 import shutil
 import subprocess
 import glob
+import io
+from sys import platform
+
+# add support for windows
+is_win32 = platform.casefold() == "win32"
 
 def get_langs_dict():
     """
@@ -64,7 +69,7 @@ def list_duplicates_in_mirror(
     dups_list = []
     for dirName, subdirList, fileList in os.walk(mirror_dir):
         for matchpath in glob.iglob(os.path.join(dirName,"*-0.txt")):
-            fname = matchpath.split("/")[-1]
+            _, fname = os.path.split(matchpath)
             # fname must have exactly one "." and one "-"
             if (len(fname.split("."))==2 and len(fname.split("-"))==2):
                 PGnumber = get_PG_number(fname)
@@ -97,10 +102,24 @@ def populate_raw_from_mirror(mirror_dir=None,
         Files in this list are not copied into raw.
 
     """
-    for dirName, subdirList, fileList in os.walk(mirror_dir):
+    
+    # for non-Windows environments: link src to tgt unless tgt exists and overwrite is off
+    def hard_link(src: str, tgt: str):
+        if (not os.path.isfile(tgt)) or overwrite:
+            subprocess.call(["ln", "-f", src, tgt])
+
+    # for Windows environments ("del" and "mklink" are cmd.exe built-ins, not programs)
+    if is_win32:
+        def win_hard_link(src: str, tgt: str):
+            if os.path.isfile(tgt) and overwrite:
+                os.remove(tgt)  # a bare ["del", tgt] call cannot be started without a shell
+            subprocess.call("mklink /H %s %s" % (tgt, src), shell=True)
+        hard_link = win_hard_link
+
+    for dirName, _, _ in os.walk(mirror_dir):
         # patterns to match are 12345-0.txt or pg12345.txt.utf8
         for matchpath in glob.iglob(os.path.join(dirName, "[p123456789][g0123456789][0-9]*")):
-            fname = matchpath.split("/")[-1]
+            _, fname = os.path.split(matchpath)
             # check that file is not in dups_list
             if matchpath not in dups_list:
                 # avoid files with more "." or "-" than expected
@@ -112,12 +131,38 @@ def populate_raw_from_mirror(mirror_dir=None,
                     source = os.path.join(dirName, fname)
                     target = os.path.join(raw_dir, "PG"+PGnumber+"_raw.txt")
 
-                    if (not os.path.isfile(target)) or overwrite:
-                        subprocess.call(["ln", "-f", source, target])
+                    hard_link(source, target)
 
             # if file was not in dupes list and we are not quiet
             elif not quiet:
                 print("# WARNING: file %s skipped due to duplication" % fname)
 
+def remove_empty_dirs(path: str, quiet: bool=False):
+    """
+    Removes empty directories below the specified path (the path itself is kept)
 
+    Parameters
+    ----------
+    path : str
+        the root directory to clean; if it is not a directory, an error is printed and nothing is done
+    quiet : bool
+        if True, do not print a message for each removed directory
 
+    """
+    # Check if the given path is a directory
+    if not os.path.isdir(path):
+        print(f"Error: {path} is not a valid directory.")
+        return
+
+    # Walk bottom-up so a directory emptied by removing its children is caught when its parent is visited
+    for dirName, subdirList, _ in os.walk(path, topdown=False):
+        for subdir in subdirList:
+            subdir_path = os.path.join(dirName, subdir)
+            if not os.listdir(subdir_path):  # Check if the directory is empty
+                os.rmdir(subdir_path)        # Remove the empty directory
+                if not quiet:
+                    print(f"Removed empty directory: {subdir_path}")
+
+def check_not_empty(fname: str) -> bool:  # True if the file contains any non-whitespace text
+    with io.open(fname, errors="ignore", encoding="utf-8") as f:  # bytes that are not valid UTF-8 are dropped
+        return bool(f.read().strip())