From 2e455e44d9523a2facc0318b8937c687209cec37 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Thu, 28 Dec 2023 21:54:55 +0000 Subject: [PATCH 01/20] Added argument for customization of rsync command, changed rsync arguments to use proper formats --- .gitignore | 7 ++++++- get_data.py | 15 +++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 5f5032e..e5819c8 100644 --- a/.gitignore +++ b/.gitignore @@ -108,4 +108,9 @@ ENV/ .mypy_cache/ # VScode -.vscode/ \ No newline at end of file +.vscode/ + +# Windows dependencies / batch files +cwRsync*/* +*.exe +*.bat \ No newline at end of file diff --git a/get_data.py b/get_data.py index 5012e68..9600467 100644 --- a/get_data.py +++ b/get_data.py @@ -68,6 +68,13 @@ action="store_true", help="Quiet mode, do not print info, warnings, etc" ) + + # rsync command + parser.add_argument( + "--rsync", + help="Specify rsync command if not `rsync`", + default='rsync', + type=str) # create the parser args = parser.parse_args() @@ -99,10 +106,10 @@ # + 12345 - 0 . t x t #--------------------------------------------- # [.-][t0][x.]t[x.] 
* [t8] - sp_args = ["rsync", "-am%s" % vstring, - "--include", "*/", - "--include", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern, - "--exclude", "*", + sp_args = [args.rsync, "-am%s" % vstring, + "--include=*/", + "--include=[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern, + "--exclude=*", "aleph.gutenberg.org::gutenberg", args.mirror ] subprocess.call(sp_args) From e424d9536cf561c5e6eb48a4500bb37fab0a2f4b Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:26:12 +0000 Subject: [PATCH 02/20] Added Win32 support and a function for cleaning empty directories --- src/utils.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/src/utils.py b/src/utils.py index ceea2ea..b8ed09d 100644 --- a/src/utils.py +++ b/src/utils.py @@ -3,6 +3,10 @@ import shutil import subprocess import glob +from sys import platform + +# add support for windows +is_win32 = platform.casefold() == "win32" def get_langs_dict(): """ @@ -64,7 +68,7 @@ def list_duplicates_in_mirror( dups_list = [] for dirName, subdirList, fileList in os.walk(mirror_dir): for matchpath in glob.iglob(os.path.join(dirName,"*-0.txt")): - fname = matchpath.split("/")[-1] + _, fname = os.path.split(matchpath) # fname must have exactly one "." and one "-" if (len(fname.split("."))==2 and len(fname.split("-"))==2): PGnumber = get_PG_number(fname) @@ -97,10 +101,24 @@ def populate_raw_from_mirror(mirror_dir=None, Files in this list are not copied into raw. 
""" - for dirName, subdirList, fileList in os.walk(mirror_dir): + + # for non-Windows environments + def hard_link(src: str, tgt: str): + if (not os.path.isfile(tgt)) or overwrite: + subprocess.call(["ln", "-f", src, tgt]) + + # for Windows environments + if is_win32: + def win_hard_link(src: str, tgt: str): + if os.path.isfile(tgt) and overwrite: + subprocess.call(["del", tgt]) + subprocess.call("mklink /H %s %s" % (tgt, src), shell=True) + hard_link = win_hard_link + + for dirName, _, _ in os.walk(mirror_dir): # patterns to match are 12345-0.txt or pg12345.txt.utf8 for matchpath in glob.iglob(os.path.join(dirName, "[p123456789][g0123456789][0-9]*")): - fname = matchpath.split("/")[-1] + _, fname = os.path.split(matchpath) # check that file is not in dups_list if matchpath not in dups_list: # avoid files with more "." or "-" than expected @@ -112,12 +130,36 @@ def populate_raw_from_mirror(mirror_dir=None, source = os.path.join(dirName, fname) target = os.path.join(raw_dir, "PG"+PGnumber+"_raw.txt") - if (not os.path.isfile(target)) or overwrite: - subprocess.call(["ln", "-f", source, target]) + hard_link(source, target) # if file was not in dupes list and we are not quiet elif not quiet: print("# WARNING: file %s skipped due to duplication" % fname) +def remove_empty_dirs(path: str, quiet: bool=False): + """ + Removes empty directories in specified path + + Parameters + ---------- + path : str + the path to clean + quiet : bool + whether to notify the deletion + + """ + # Check if the given path is a directory + if not os.path.isdir(path): + print(f"Error: {path} is not a valid directory.") + return + + # Recursively remove empty subdirectories + for dirName, subdirList, _ in os.walk(path, topdown=False): + for subdir in subdirList: + subdir_path = os.path.join(dirName, subdir) + if not os.listdir(subdir_path): # Check if the directory is empty + os.rmdir(subdir_path) # Remove the empty directory + if not quiet: + print(f"Removed empty directory: {subdir_path}") 
From eabe9810ce583dfdf3ba70e8912e7cfce0f6918f Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:28:11 +0000 Subject: [PATCH 03/20] Fixed typo, exposing nltk_data dir as string variable --- src/tokenizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.py b/src/tokenizer.py index b25f21e..873fb9c 100644 --- a/src/tokenizer.py +++ b/src/tokenizer.py @@ -3,9 +3,10 @@ Call tokenize and pass a text (i.e. as a string). You will get a list of tokens """ +nltk_dir = "src/nltk_data" import nltk -nltk.data.path=["src/nltk_data"] +nltk.data.path=[nltk_dir] from nltk.tokenize.treebank import TreebankWordTokenizer from nltk.tokenize import sent_tokenize @@ -42,7 +43,7 @@ def tokenize_text(text, language="english"): def filter_tokens(list_tokens): '''Remove un-wanted tokens from list of tokens - We only keep words that return TRUE for string.isaplha() + We only keep words that return TRUE for string.isalpha() We lowercase every token with string.lower() ''' list_tokens_filter = [h.lower() for h in list_tokens if h.isalpha()] From f759ae0187d2e2cab56c3abaa6785480be85cc3d Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:30:50 +0000 Subject: [PATCH 04/20] Used OS-independent path-parsing, changed behavior of process_book function to return logging message instead of appending to log file directly --- src/pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pipeline.py b/src/pipeline.py index 5e89c56..3d290ba 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -53,7 +53,7 @@ def process_book( raise ValueError("You must specify a path to the raw file to process.") # get PG number - PG_number = path_to_raw_file.split("/")[-1].split("_")[0][2:] + PG_number = os.path.split(path_to_raw_file)[-1].split("_")[0][2:] if overwrite_all or\ (not os.path.isfile(os.path.join(text_dir,"PG%s_text.txt"%PG_number))) or \ @@ -93,6 +93,5 @@ def process_book( clean_nl = clean.count("\n") L = 
len(tokens) V = len(counts) - with io.open(log_file, "a") as f: - f.write("PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n") + return "PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n" From 5e3ea272281f1612fdd982b5f06076d3767693b1 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:32:05 +0000 Subject: [PATCH 05/20] Used OS-independent path-parsing --- src/cleanup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanup.py b/src/cleanup.py index 69967f6..90cb232 100644 --- a/src/cleanup.py +++ b/src/cleanup.py @@ -16,7 +16,7 @@ def cleanup(path, text_dir): Path to the PG****_raw.txt file """ - PG_number = path.split("/")[-1].split("_")[0][2:] + PG_number = os.path.split(path)[-1].split("_")[0][2:] with io.open(path) as f: text = f.read() From be5cb201fdb5e59dfc5bbcbff864413e75d3ff29 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:36:06 +0000 Subject: [PATCH 06/20] Added Win32 support --- src/bookshelves.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/bookshelves.py b/src/bookshelves.py index 62e31aa..7ed8b2f 100644 --- a/src/bookshelves.py +++ b/src/bookshelves.py @@ -7,7 +7,19 @@ import pandas as pd import lxml.html import subprocess +import shutil +from .utils import is_win32 +def rm_dir(*args): + subprocess.call(["rm", "-rf"] + args) + +rm_pattern = rm_dir + +if is_win32: + rm_dir = os.rmdir + def rm_pattern(path): + for file in glob.glob(path): + os.remove(file) def get_bookshelves(): """ @@ -28,21 +40,24 @@ def get_bookshelves(): subprocess.call(sp_args) # move it to metadata dir - sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/" - subprocess.call(sp_args, shell=True) + if is_win32: + dst = "metadata/bookshelves_html/" + if not os.path.exists(dst): + os.mkdir(dst) + for src_path in 
glob.glob("www.gutenberg.org/ebooks/bookshelf/*"): + shutil.move(src_path, dst) + else: + sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/" + subprocess.call(sp_args, shell=True) # cleanup - sp_args = ["rm", "-rf", "www.gutenberg.org"] - subprocess.call(sp_args) + rm_dir("www.gutenberg.org") # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page) # we get also other files, copy of the bookshelves but with different ordering # remove them - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"] - subprocess.call(sp_args) + rm_pattern("metadata/bookshelves_html/*.opds*") + rm_pattern("metadata/bookshelves_html/*?sort*") + rm_pattern("metadata/bookshelves_html/*?start*") return None def parse_bookshelves(): @@ -57,12 +72,12 @@ def parse_bookshelves(): """ # parse the data BS_paths = glob.glob("metadata/bookshelves_html/*") - BS = [path.split("/")[-1] for path in BS_paths] + # BS = [os.path.split(path)[-1] for path in BS_paths] BS_dict = {} BS_num_to_category_str_dict = {} for path in BS_paths: - bs = path.split("/")[-1] + _, bs = os.path.split(path) BS_dict[bs] = [] with open(path, "r", encoding="UTF-8") as foo: dom = lxml.html.fromstring(foo.read()) From 0e6404d29abbc0b3769027bb99c7031f2e9cc699 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:39:24 +0000 Subject: [PATCH 07/20] Added Win32 support and freedom to specify the stages to go through via an optional `--procedures` argument --- get_data.py | 92 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 28 deletions(-) diff --git a/get_data.py b/get_data.py index 9600467..5cdc3f0 100644 --- a/get_data.py +++ b/get_data.py @@ -5,7 +5,7 @@ M. Gerlach & F. 
Font-Clos """ -from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror +from src.utils import populate_raw_from_mirror, list_duplicates_in_mirror, remove_empty_dirs, is_win32 from src.metadataparser import make_df_metadata from src.bookshelves import get_bookshelves from src.bookshelves import parse_bookshelves @@ -22,6 +22,7 @@ "This script will download all books currently not in your\n" "local copy of PG and get the latest version of the metadata.\n" ) + # mirror dir parser.add_argument( "-m", "--mirror", @@ -69,22 +70,48 @@ help="Quiet mode, do not print info, warnings, etc" ) + # clean argument, to supress info + parser.add_argument( + "-c", "--clean", + action="store_true", + help="Clean the mirror directory to remove any empty folders" + ) + # rsync command parser.add_argument( "--rsync", - help="Specify rsync command if not `rsync`", + help="Specify an alternative rsync command", default='rsync', type=str) + + # rsync command + parser.add_argument( + "--procedures", + help='''Procedures to go through, defaults to \"pdlmb\": + - [p]ull mirror files + - find [d]uplicates + - hard [l]ink from mirror to raw + - get [m]etadata + - get [b]ookshelf information''', + default='pdlmb', + type=str) # create the parser args = parser.parse_args() + mirror_dir, raw_dir, metadata_dir = args.mirror, args.raw, args.metadata + + if is_win32: + print("Windows detected, please make sure wget is installed and added to PATH") + mirror_dir = mirror_dir.replace('/', '\\') + raw_dir = raw_dir.replace('/', '\\') + metadata_dir = metadata_dir.replace('/', '\\') # check that all dirs exist - if not os.path.isdir(args.mirror): + if not os.path.isdir(mirror_dir): raise ValueError("The specified mirror directory does not exist.") - if not os.path.isdir(args.raw): + if not os.path.isdir(raw_dir): raise ValueError("The specified raw directory does not exist.") - if not os.path.isdir(args.metadata): + if not os.path.isdir(metadata_dir): raise ValueError("The specified 
metadata directory does not exist.") # Update the .mirror directory via rsync @@ -106,49 +133,58 @@ # + 12345 - 0 . t x t #--------------------------------------------- # [.-][t0][x.]t[x.] * [t8] - sp_args = [args.rsync, "-am%s" % vstring, - "--include=*/", - "--include=[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern, - "--exclude=*", - "aleph.gutenberg.org::gutenberg", args.mirror - ] - subprocess.call(sp_args) + includes = ["*/", "[p123456789][g0123456789]%s[.-][t0][x.]t[x.]*[t8]" % args.pattern] + excludes = ["*"] + sp_args = ' '.join([args.rsync, "-am%s" % vstring] + ["--include=\"%s\"" % i for i in includes] + \ + ["--exclude=\"%s\"" % i for i in excludes] + ["aleph.gutenberg.org::gutenberg", mirror_dir]) + + # If specified, remove any empty directory that might be caused by bugs or wrong patterns in rsync + if args.clean: + remove_empty_dirs(mirror_dir, args.quiet) + + # Subprocess call (default arguments): + # rsync -amv --include="*/" --include="[p123456789][g0123456789]*[.-][t0][x.]t[x.]*[t8]" --exclude="*" aleph.gutenberg.org::gutenberg data/.mirror/ + if 'p' in args.procedures: + subprocess.call(sp_args) # Get rid of duplicates # --------------------- # A very small portion of books are stored more than # once in PG's site. We keep the newest one, see # erase_duplicates_in_mirror docstring. - dups_list = list_duplicates_in_mirror(mirror_dir=args.mirror) + dups_list = list_duplicates_in_mirror(mirror_dir=mirror_dir) if 'd' in args.procedures else [] # Populate raw from mirror # ------------------------ # We populate 'raw_dir' hardlinking to # the hidden 'mirror_dir'. Names are standarized # into PG12345_raw.txt form. 
- populate_raw_from_mirror( - mirror_dir=args.mirror, - raw_dir=args.raw, - overwrite=args.overwrite_raw, - dups_list=dups_list, - quiet=args.quiet + if 'l' in args.procedures: + populate_raw_from_mirror( + mirror_dir=mirror_dir, + raw_dir=raw_dir, + overwrite=args.overwrite_raw, + dups_list=dups_list, + quiet=args.quiet ) # Update metadata # --------------- # By default, update the whole metadata csv # file each time new data is downloaded. - make_df_metadata( - path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'), - path_out=os.path.join(args.metadata, 'metadata.csv'), - update=args.keep_rdf + if 'm' in args.procedures: + make_df_metadata( + path_xml=os.path.join(metadata_dir, 'rdf-files.tar.bz2'), + path_out=os.path.join(metadata_dir, 'metadata.csv'), + update=args.keep_rdf ) # Bookshelves # ----------- # Get bookshelves and their respective books and titles as dicts - BS_dict, BS_num_to_category_str_dict = parse_bookshelves() - with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: - pickle.dump(BS_dict, fp) - with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp: - pickle.dump(BS_num_to_category_str_dict, fp) \ No newline at end of file + if 'b' in args.procedures: + BS_dict, BS_num_to_category_str_dict = parse_bookshelves() + with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: + pickle.dump(BS_dict, fp) + with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp: + pickle.dump(BS_num_to_category_str_dict, fp) \ No newline at end of file From 91ef9c0621828c52677f88797908eb5d6895ddbd Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:41:40 +0000 Subject: [PATCH 08/20] Fixed a typo and an oversight regarding nltk data download, more customization and multi-threading/processing support in progress --- process_data.py | 91 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/process_data.py b/process_data.py index e21b92f..1bbbfa3 100644 --- 
a/process_data.py +++ b/process_data.py @@ -11,12 +11,17 @@ import glob import ast import pandas as pd +import concurrent.futures +import io from src.pipeline import process_book -from src.utils import get_langs_dict +from src.utils import get_langs_dict, is_win32 if __name__ == '__main__': + from src.tokenizer import nltk_dir + import nltk + nltk.download("punkt", nltk_dir) # avoid lookup error parser = argparse.ArgumentParser( "Processing raw texts from Project Gutenberg:" @@ -48,7 +53,7 @@ # pattern to specify subset of books parser.add_argument( "-p", "--pattern", - help="Patttern to specify a subset of books", + help="Pattern to specify a subset of books", default='*', type=str) @@ -68,17 +73,25 @@ # add arguments to parser args = parser.parse_args() + raw_dir, text_dir, tokens_dir, counts_dir = args.raw, args.output_text, args.output_tokens, args.output_counts + + if is_win32: + print("Windows detected") + raw_dir = raw_dir.replace('/', '\\') + text_dir = text_dir.replace('/', '\\') + tokens_dir = tokens_dir.replace('/', '\\') + counts_dir = counts_dir.replace('/', '\\') # check whether the out-put directories exist - if os.path.isdir(args.output_text) is False: + if os.path.isdir(text_dir) is False: raise ValueError("The directory for output of texts '%s' " - "does not exist" % (args.output_text)) - if os.path.isdir(args.output_tokens) is False: + "does not exist" % (text_dir)) + if os.path.isdir(tokens_dir) is False: raise ValueError("The directory for output of tokens '%s' " - "does not exist" % (args.output_tokens)) - if os.path.isdir(args.output_counts) is False: + "does not exist" % (tokens_dir)) + if os.path.isdir(counts_dir) is False: raise ValueError("The directory for output of counts '%s' " - "does not exist" % (args.output_counts)) + "does not exist" % (counts_dir)) # load metadata metadata = pd.read_csv("metadata/metadata.csv").set_index("id") @@ -88,13 +101,15 @@ # loop over all books in the raw-folder pbooks = 0 - for filename in 
glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))): - # The process_books function will fail very rarely, whne - # a file tagged as UTf-8 is not really UTF-8. We kust - # skip those books. - try: + + with concurrent.futures.ProcessPoolExecutor() as pool: + book_process_jobs = [] + for filename in glob.glob(join(raw_dir, 'PG%s_raw.txt' % (args.pattern))): + # The process_books function will fail very rarely, whne + # a file tagged as UTf-8 is not really UTF-8. We kust + # skip those books. # get PG_id - PG_id = filename.split("/")[-1].split("_")[0] + PG_id = os.path.split(filename)[-1].split("_")[0] # get language from metadata # default is english @@ -105,23 +120,41 @@ language = langs_dict[lang_id] # process the book: strip headers, tokenize, count - process_book( + book_process_jobs.append(pool.submit( + process_book, path_to_raw_file=filename, - text_dir=args.output_text, - tokens_dir=args.output_tokens, - counts_dir=args.output_counts, + text_dir=text_dir, + tokens_dir=tokens_dir, + counts_dir=counts_dir, language=language, - log_file=args.log_file - ) + log_file=args.log_file)) + pbooks += 1 if not args.quiet: - print("Processed %d books..." % pbooks, end="\r") - except UnicodeDecodeError: - if not args.quiet: - print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename) - except KeyError: - if not args.quiet: - print("# WARNING: metadata for '%s' not found" % filename) - except Exception as e: + print("%d book processing jobs started..." 
% pbooks, end="\r") + + print() + pbooks = 0 + for job in concurrent.futures.as_completed(book_process_jobs): + if args.log_file: + try: + log_content = job.result() + with io.open(args.log_file, "a") as f: + f.write(log_content) + except UnicodeDecodeError: + if not args.quiet: + print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename) + except KeyError: + if not args.quiet: + print("# WARNING: metadata for '%s' not found" % filename) + except LookupError as e: + print("Very likely that an NLTK resource needs to be downloaded") + raise e + except Exception as e: + if not args.quiet: + print("# WARNING: cannot process '%s' (unkown error)" % filename) + raise e + pbooks += 1 if not args.quiet: - print("# WARNING: cannot process '%s' (unkown error)" % filename) + print("Processed %d books..." % pbooks, end="\r") + From d986607cf947dd53624531bed46fce962452d9b9 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 02:51:28 +0000 Subject: [PATCH 09/20] modified: .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e5819c8..df32ff6 100644 --- a/.gitignore +++ b/.gitignore @@ -113,4 +113,7 @@ ENV/ # Windows dependencies / batch files cwRsync*/* *.exe -*.bat \ No newline at end of file +*.bat + +# nltk data directory +src/nltk_data/** From cb4f58b0c62ecc8ee30fbbeb532869b2c474a7d1 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 14:51:55 +0000 Subject: [PATCH 10/20] Added option to ignore UTF-8 decoding failures for "technically UTF-8" codecs such as Windows-1252 --- src/pipeline.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/pipeline.py b/src/pipeline.py index 3d290ba..ccc86aa 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -14,7 +14,8 @@ def process_book( cleanup_f=strip_headers, overwrite_all=False, language="english", - log_file="" + log_file="", + ignore=False ): """ Process a book, from raw data to counts. 
@@ -39,6 +40,9 @@ def process_book( ---------- overwrite_all : bool If set to True, everything is processed regargless of existing files. + ignore : bool + If set to True, ignores UTF-8 decoding errors for "technically UTF-8" codecs + such as Windows-1252, enabling this shouldn't lead to the loss of any token """ if text_dir is None: raise ValueError("You must specify a path to save the text files.") @@ -60,7 +64,8 @@ def process_book( (not os.path.isfile(os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number))) or \ (not os.path.isfile(os.path.join(counts_dir,"PG%s_counts.txt"%PG_number))): # read raw file - with io.open(path_to_raw_file, encoding="UTF-8") as f: + with io.open(path_to_raw_file, encoding="UTF-8", + errors="ignore" if ignore else "strict") as f: text = f.read() # clean it up From f69409559656e944ac368cd41acbaafecdbaee2e Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Sat, 30 Dec 2023 14:58:01 +0000 Subject: [PATCH 11/20] Added detection for books already processed, argument for specifying multi-threading or processing, argument for ignoring UTF-8 decoding failures --- process_data.py | 99 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/process_data.py b/process_data.py index 1bbbfa3..566f909 100644 --- a/process_data.py +++ b/process_data.py @@ -13,6 +13,7 @@ import pandas as pd import concurrent.futures import io +import re from src.pipeline import process_book from src.utils import get_langs_dict, is_win32 @@ -70,6 +71,20 @@ help="Path to log file", default=".log", type=str) + + # whether to ignore UTF-8 decoding errors + parser.add_argument( + "--ignore", + action="store_true", + help="Whether to ignore UTF-8 decoding errors") + + # multi-threading/processing choice + parser.add_argument( + "--pool", + help="Whether to use multi-processing or multi-threading", + default="process", + choices=["process", "thread"], + type=str) # add arguments to parser args = parser.parse_args() @@ -101,9 
+116,25 @@ # loop over all books in the raw-folder pbooks = 0 - - with concurrent.futures.ProcessPoolExecutor() as pool: - book_process_jobs = [] + + # find out which jobs were already done + re_pattern = args.pattern.replace('*', '.*') # wild card roughly equals .* in regex + pattern_text = re.compile('(PG%s)_text.txt' % (re_pattern)) + pattern_tokens = re.compile('(PG%s)_tokens.txt' % (re_pattern)) + pattern_counts = re.compile('(PG%s)_counts.txt' % (re_pattern)) + exist_text = {pattern_text.fullmatch(f) for f in + glob.glob('PG%s_text.txt' % (args.pattern), root_dir=text_dir)} + exist_tokens = {pattern_tokens.fullmatch(f) for f in + glob.glob('PG%s_tokens.txt' % (args.pattern), root_dir=tokens_dir)} + exist_counts = {pattern_counts.fullmatch(f) for f in + glob.glob('PG%s_counts.txt' % (args.pattern), root_dir=counts_dir)} + exist_text = {f.group(1) for f in exist_text if f} + exist_tokens = {f.group(1) for f in exist_tokens if f} + exist_counts = {f.group(1) for f in exist_counts if f} + done_jobs = exist_text & exist_tokens & exist_counts + + with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool: + book_process_jobs = dict() for filename in glob.glob(join(raw_dir, 'PG%s_raw.txt' % (args.pattern))): # The process_books function will fail very rarely, whne # a file tagged as UTf-8 is not really UTF-8. 
We kust @@ -111,31 +142,46 @@ # get PG_id PG_id = os.path.split(filename)[-1].split("_")[0] - # get language from metadata - # default is english - language = "english" - # language is a string representing a list of languages codes - lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0] - if lang_id in langs_dict.keys(): - language = langs_dict[lang_id] - - # process the book: strip headers, tokenize, count - book_process_jobs.append(pool.submit( - process_book, - path_to_raw_file=filename, - text_dir=text_dir, - tokens_dir=tokens_dir, - counts_dir=counts_dir, - language=language, - log_file=args.log_file)) - + if PG_id not in done_jobs: + # get language from metadata + # default is english + language = "english" + try: + # language is a string representing a list of languages codes + lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0] + if lang_id in langs_dict.keys(): + language = langs_dict[lang_id] + except KeyError: + if not args.quiet: + msg = "# WARNING: metadata for '%s' not found" % filename + print(msg) + if args.log_file: + with io.open(args.log_file, "a") as f: + f.write(msg + '\n') + + # process the book: strip headers, tokenize, count + book_process_jobs[ + pool.submit( + process_book, + path_to_raw_file=filename, + text_dir=text_dir, + tokens_dir=tokens_dir, + counts_dir=counts_dir, + overwrite_all=True, + language=language, + log_file=args.log_file, + ignore=args.ignore) + ] = PG_id + pbooks += 1 if not args.quiet: print("%d book processing jobs started..." 
% pbooks, end="\r") - print() + print("\n%d book processing jobs created in total" % len(book_process_jobs)) + pbooks = 0 for job in concurrent.futures.as_completed(book_process_jobs): + PG_id = book_process_jobs[job] if args.log_file: try: log_content = job.result() @@ -143,18 +189,15 @@ f.write(log_content) except UnicodeDecodeError: if not args.quiet: - print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename) - except KeyError: - if not args.quiet: - print("# WARNING: metadata for '%s' not found" % filename) + print("# WARNING: cannot process '%s' (encoding not UTF-8)" % PG_id) except LookupError as e: print("Very likely that an NLTK resource needs to be downloaded") raise e except Exception as e: if not args.quiet: - print("# WARNING: cannot process '%s' (unkown error)" % filename) + print("# WARNING: cannot process '%s' (unkown error)" % PG_id) raise e pbooks += 1 if not args.quiet: print("Processed %d books..." % pbooks, end="\r") - + print("\ndone") From 1a06f5306c2dd7aa2984ea91604297f5af5b537d Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Thu, 4 Jan 2024 17:05:46 +0000 Subject: [PATCH 12/20] Added in the missing `get_bookshelves()` call in get_data.py and an utility function for checking if a file is empty --- get_data.py | 1 + src/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/get_data.py b/get_data.py index 5cdc3f0..11354dc 100644 --- a/get_data.py +++ b/get_data.py @@ -183,6 +183,7 @@ # ----------- # Get bookshelves and their respective books and titles as dicts if 'b' in args.procedures: + get_bookshelves() BS_dict, BS_num_to_category_str_dict = parse_bookshelves() with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: pickle.dump(BS_dict, fp) diff --git a/src/utils.py b/src/utils.py index b8ed09d..6b5a8cd 100644 --- a/src/utils.py +++ b/src/utils.py @@ -3,6 +3,7 @@ import shutil import subprocess import glob +import io from sys import platform # add support for windows @@ -162,4 +163,6 @@ def 
remove_empty_dirs(path: str, quiet: bool=False): if not quiet: print(f"Removed empty directory: {subdir_path}") - +def check_not_empty(fname: str) -> bool: + with io.open(fname, errors="ignore", encoding="utf-8") as f: + return bool(f.read().strip()) From cfef209b66aaa32dc685dc28e102c314dc7b787b Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Thu, 4 Jan 2024 17:19:24 +0000 Subject: [PATCH 13/20] Added an option to check if any of the resultant files are empty before assuming a book is "done" --- .gitignore | 6 ++++++ process_data.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index df32ff6..639fa2c 100644 --- a/.gitignore +++ b/.gitignore @@ -117,3 +117,9 @@ cwRsync*/* # nltk data directory src/nltk_data/** + +# Wget temporary directory +*gutenberg*/ + +# Jupyter notebooks for processing data +*.ipynb \ No newline at end of file diff --git a/process_data.py b/process_data.py index 566f909..bdfc75a 100644 --- a/process_data.py +++ b/process_data.py @@ -16,7 +16,7 @@ import re from src.pipeline import process_book -from src.utils import get_langs_dict, is_win32 +from src.utils import get_langs_dict, check_not_empty, is_win32 if __name__ == '__main__': @@ -72,6 +72,12 @@ default=".log", type=str) + # check if existing files are empty + parser.add_argument( + "-c", "--check_empty", + action="store_true", + help="Whether to check if existing files are empty") + # whether to ignore UTF-8 decoding errors parser.add_argument( "--ignore", @@ -128,10 +134,36 @@ glob.glob('PG%s_tokens.txt' % (args.pattern), root_dir=tokens_dir)} exist_counts = {pattern_counts.fullmatch(f) for f in glob.glob('PG%s_counts.txt' % (args.pattern), root_dir=counts_dir)} + exist_text = {f.group(1) for f in exist_text if f} exist_tokens = {f.group(1) for f in exist_tokens if f} exist_counts = {f.group(1) for f in exist_counts if f} done_jobs = exist_text & exist_tokens & exist_counts + del exist_text, exist_tokens, 
exist_counts + + if args.check_empty: + with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool: + tmp0 = [(os.path.join(text_dir, PG_id) + "_text.txt", PG_id) for PG_id in done_jobs] + tmp1 = [(os.path.join(tokens_dir, PG_id) + "_tokens.txt", PG_id) for PG_id in done_jobs] + tmp2 = [(os.path.join(counts_dir, PG_id) + "_counts.txt", PG_id) for PG_id in done_jobs] + validate_jobs0 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp0} + validate_jobs1 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp1} + validate_jobs2 = {pool.submit(check_not_empty, f) : PG_id for f, PG_id in tmp2} + validation_results = {PG_id : 0 for PG_id in done_jobs} + if not args.quiet: + print("%d books to check for completion (3 passes required)" % len(done_jobs)) + for job_type in [validate_jobs0, validate_jobs1, validate_jobs2]: + pbooks = 0 + for job in concurrent.futures.as_completed(job_type): + if job.result(): + validation_results[job_type[job]] += 1 + pbooks += 1 + if (not args.quiet) and (pbooks % 100 == 0): + print("%6d books checked for completion" % pbooks, end="\r") + done_jobs = {PG_id for PG_id in validation_results if validation_results[PG_id] == 3} + if not args.quiet: + print("%d books seem to be processed but have empty file(s)" %(len(validation_results) - len(done_jobs))) + del tmp0, tmp1, tmp2, validate_jobs0, validate_jobs1, validate_jobs2, validation_results with eval("concurrent.futures.%sPoolExecutor()" % args.pool.capitalize()) as pool: book_process_jobs = dict() From c183413d6621d4a921814fd3c20f61c716f04166 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Fri, 5 Jan 2024 01:07:39 +0000 Subject: [PATCH 14/20] Fixed several bugs in bookshelf-related code: - used `shutil.rmtree` instead of `os.rmdir` since latter is only for empty dir and added check for existence of dir to remove (win32) - removed `-p` option when calling wget to avoid downloading large amount of useless data (original code) - filtered out many garbage 
data (non-book weblinks) in bookshelves dicts and removed "PG" prefix for values of index.html in bookshelves_ebooks_dict.pkl (original code) --- src/bookshelves.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/bookshelves.py b/src/bookshelves.py index 7ed8b2f..791b31c 100644 --- a/src/bookshelves.py +++ b/src/bookshelves.py @@ -13,13 +13,13 @@ def rm_dir(*args): subprocess.call(["rm", "-rf"] + args) -rm_pattern = rm_dir - if is_win32: - rm_dir = os.rmdir + rm_dir = shutil.rmtree def rm_pattern(path): - for file in glob.glob(path): + for file in glob.glob(path.replace('/', os.path.sep)): os.remove(file) +else: + rm_pattern = rm_dir def get_bookshelves(): """ @@ -31,8 +31,8 @@ def get_bookshelves(): """ sp_args = ["wget", - "--random-wait", "-r", - "-p", "--no-parent", + "--random-wait", "-r", + "--no-parent", "-e", "robots=off", "-U", "mozilla", "https://www.gutenberg.org/ebooks/bookshelf/" @@ -51,7 +51,8 @@ def get_bookshelves(): subprocess.call(sp_args, shell=True) # cleanup - rm_dir("www.gutenberg.org") + if os.path.exists("www.gutenberg.org"): + rm_dir("www.gutenberg.org") # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page) # we get also other files, copy of the bookshelves but with different ordering # remove them @@ -78,15 +79,17 @@ def parse_bookshelves(): BS_num_to_category_str_dict = {} for path in BS_paths: _, bs = os.path.split(path) + PG_header = '' if bs == "index.html" else "PG" BS_dict[bs] = [] with open(path, "r", encoding="UTF-8") as foo: - dom = lxml.html.fromstring(foo.read()) + dom = lxml.html.parse(path) # select the url in href for all a tags(links) for link in dom.xpath('//a/@href'): # links to ebooks that are not searches if link.find("ebooks") > -1 and link.find("search") == -1: - PGid = "PG"+link.split("/")[-1] - BS_dict[bs].append(PGid) + book_id = link.split("/")[-1] + if book_id.isdigit(): + 
BS_dict[bs].append(PG_header + book_id) # get title of the category title_categories = dom.findall('.//title') # './/title' finds recursively the element with tag 'title' # check if there is only one title in the metadata of the category From 93208e6c421ae92214b5031889f5baf6ca91d331 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Fri, 5 Jan 2024 01:10:57 +0000 Subject: [PATCH 15/20] Further extended the procedures option to allow for parsing/saving of bookshelves info without running time-consuming Wget --- get_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/get_data.py b/get_data.py index 11354dc..c39810e 100644 --- a/get_data.py +++ b/get_data.py @@ -92,8 +92,9 @@ - find [d]uplicates - hard [l]ink from mirror to raw - get [m]etadata - - get [b]ookshelf information''', - default='pdlmb', + - get [b]ookshelf information + - [s]tore bookshelf information''', + default='pdlmbs', type=str) # create the parser @@ -184,6 +185,8 @@ # Get bookshelves and their respective books and titles as dicts if 'b' in args.procedures: get_bookshelves() + + if 's' in args.procedures: BS_dict, BS_num_to_category_str_dict = parse_bookshelves() with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: pickle.dump(BS_dict, fp) From 8c6b530f0c464be1d5ce1ec9ad3a35c664e48c94 Mon Sep 17 00:00:00 2001 From: Hugo^3 Date: Fri, 5 Jan 2024 01:43:45 +0000 Subject: [PATCH 16/20] Corrected help message for procedures option --- get_data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/get_data.py b/get_data.py index c39810e..0ebdf8d 100644 --- a/get_data.py +++ b/get_data.py @@ -87,13 +87,13 @@ # rsync command parser.add_argument( "--procedures", - help='''Procedures to go through, defaults to \"pdlmb\": - - [p]ull mirror files - - find [d]uplicates - - hard [l]ink from mirror to raw - - get [m]etadata - - get [b]ookshelf information - - [s]tore bookshelf information''', + help='''Procedures to go through, defaults to \"pdlmbs\": + 
[p]ull mirror files; + find [d]uplicates; + hard [l]ink from mirror to raw; + get [m]etadata; + get [b]ookshelf information; + [s]tore bookshelf information''', default='pdlmbs', type=str) From b81b321574bf3091e6c43d4d7b3399670273325b Mon Sep 17 00:00:00 2001 From: Hugo Date: Fri, 5 Jan 2024 09:47:18 +0800 Subject: [PATCH 17/20] Update README.md --- README.md | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7bd926f..43a213c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,13 @@ SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that wer For **most other use cases**, however, you probably want the latest, most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset. +## Changes in this fork +- Windows support (still need to install `wget` and `cwRsync` (cwRsync tested with 5.4.1) +- Fixed stuffs in original code: + - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.) + - bugs & typos +- Parallelised text processing +- Additional arguments for customization ## Installation :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/). @@ -43,7 +50,7 @@ python get_data.py This will download a copy of all UTF-8 books in PG and will create a csv file with metadata (e.g. author, title, year, ...). Notice that if you already have some of the data, the program will only download those you are missing (we use `rsync` for this). It is hence easy to update the dataset periodically to keep it up-to-date by just running `get_data.py`. 
- +> For Windows users, see the [**Usage**](#usage) section ## Processing the data To process all the data in the `raw/` directory, run @@ -51,6 +58,75 @@ To process all the data in the `raw/` directory, run python process_data.py ``` This will fill in the `text/`, `tokens/` and `counts/` folders. +> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see the [**Usage**](#usage) section +## Usage +**Recommended usage for `get_data.py` (Windows user):** +```bash +python get_data.py --rsync "cwRsync_5.4.1/rsync" +``` +(replace `cwRsync_5.4.1/rsync` with path to your rsync binary, `.exe` is not needed) + +**Recommended usage for `process_data.py`:** +```bash +python process_data.py --ignore +``` +**How to use `get_data.py` with customisation options:** +``` +python get_data.py --help +usage: Update local PG repository. + +This script will download all books currently not in your +local copy of PG and get the latest version of the metadata. + + [-h] [-m MIRROR] [-r RAW] [-M METADATA] [-p PATTERN] [-k] [-owr] [-q] [-c] [--rsync RSYNC] [--procedures PROCEDURES] + +options: + -h, --help show this help message and exit + -m MIRROR, --mirror MIRROR + Path to the mirror folder that will be updated via rsync. + -r RAW, --raw RAW Path to the raw folder. + -M METADATA, --metadata METADATA + Path to the metadata folder. + -p PATTERN, --pattern PATTERN + Patterns to get only a subset of books. + -k, --keep_rdf If there is an RDF file in metadata dir, do not overwrite it. + -owr, --overwrite_raw + Overwrite files in raw. 
+ -q, --quiet Quiet mode, do not print info, warnings, etc + -c, --clean Clean the mirror directory to remove any empty folders + --rsync RSYNC Specify an alternative rsync command + --procedures PROCEDURES + Procedures to go through, defaults to "pdlmbs": [p]ull mirror files; find [d]uplicates; hard [l]ink from mirror to raw; + get [m]etadata; get [b]ookshelf information; [s]tore bookshelf information +``` +**How to use `process_data.py` with customisation options:** +``` +python process_data.py --help +[nltk_data] Downloading package punkt to src/nltk_data... +[nltk_data] Package punkt is already up-to-date! +usage: Processing raw texts from Project Gutenberg: i) removing headers,ii) tokenizing, and iii) counting words. + [-h] [-r RAW] [-ote OUTPUT_TEXT] [-oto OUTPUT_TOKENS] [-oco OUTPUT_COUNTS] [-p PATTERN] [-q] [-l LOG_FILE] [-c] [--ignore] + [--pool {process,thread}] + +options: + -h, --help show this help message and exit + -r RAW, --raw RAW Path to the raw-folder + -ote OUTPUT_TEXT, --output_text OUTPUT_TEXT + Path to text-output (text_dir) + -oto OUTPUT_TOKENS, --output_tokens OUTPUT_TOKENS + Path to tokens-output (tokens_dir) + -oco OUTPUT_COUNTS, --output_counts OUTPUT_COUNTS + Path to counts-output (counts_dir) + -p PATTERN, --pattern PATTERN + Pattern to specify a subset of books + -q, --quiet Quiet mode, do not print info, warnings, etc + -l LOG_FILE, --log_file LOG_FILE + Path to log file + -c, --check_empty Whether to check if existing files are empty + --ignore Whether to ignore UTF-8 decoding errors + --pool {process,thread} + Whether to use multi-processing or multi-threading +``` From 0349d282989dfe5aa4c552616ac18dd634920db5 Mon Sep 17 00:00:00 2001 From: Hugo Date: Fri, 5 Jan 2024 09:51:59 +0800 Subject: [PATCH 18/20] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 43a213c..e8d8a17 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ For **most other use 
cases**, however, you probably want the latest, most recent - bugs & typos - Parallelised text processing - Additional arguments for customization +> **Note:** +> this fork has only been tested on Windows (yet), but should work on other platforms unless the original code doesn't work in the first place? ## Installation :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/). @@ -105,8 +107,6 @@ options: **How to use `process_data.py` with customisation options:** ``` python process_data.py --help -[nltk_data] Downloading package punkt to src/nltk_data... -[nltk_data] Package punkt is already up-to-date! usage: Processing raw texts from Project Gutenberg: i) removing headers,ii) tokenizing, and iii) counting words. [-h] [-r RAW] [-ote OUTPUT_TEXT] [-oto OUTPUT_TOKENS] [-oco OUTPUT_COUNTS] [-p PATTERN] [-q] [-l LOG_FILE] [-c] [--ignore] [--pool {process,thread}] From a96bf1141ca804fa02815f8d1e8d913671729c8f Mon Sep 17 00:00:00 2001 From: Hugo Date: Fri, 5 Jan 2024 09:54:22 +0800 Subject: [PATCH 19/20] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e8d8a17..a24be6e 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ For **most other use cases**, however, you probably want the latest, most recent - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.) - bugs & typos - Parallelised text processing -- Additional arguments for customization +- Additional arguments for customisation (see [**Usage**](#usage) section) > **Note:** > this fork has only been tested on Windows (yet), but should work on other platforms unless the original code doesn't work in the first place? @@ -52,7 +52,7 @@ python get_data.py This will download a copy of all UTF-8 books in PG and will create a csv file with metadata (e.g. author, title, year, ...). 
Notice that if you already have some of the data, the program will only download those you are missing (we use `rsync` for this). It is hence easy to update the dataset periodically to keep it up-to-date by just running `get_data.py`. -> For Windows users, see the [**Usage**](#usage) section +> For Windows users, see [**Usage**](#usage) section ## Processing the data To process all the data in the `raw/` directory, run @@ -60,7 +60,7 @@ To process all the data in the `raw/` directory, run python process_data.py ``` This will fill in the `text/`, `tokens/` and `counts/` folders. -> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see the [**Usage**](#usage) section +> To avoid losing ebooks that are actually UTF-8 but mistakenly removed in the original code, see [**Usage**](#usage) section ## Usage **Recommended usage for `get_data.py` (Windows user):** From 0764543b615aeba59af6aa6e9e4dc7eb79b46804 Mon Sep 17 00:00:00 2001 From: Hugo Date: Fri, 5 Jan 2024 10:01:48 +0800 Subject: [PATCH 20/20] Update README.md --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a24be6e..ef32add 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,8 @@ For **most other use cases**, however, you probably want the latest, most recent ## Changes in this fork - Windows support (still need to install `wget` and `cwRsync` (cwRsync tested with 5.4.1) -- Fixed stuffs in original code: +- Patched stuffs in original code: + - unwanted garbage in bookshelves info (probably due to Project Gutenberg website updating) - oversights (bookshelves info are never fetched, nltk missing download, utf-8 decoding error in ebook header, etc.) 
- bugs & typos - Parallelised text processing @@ -24,6 +25,13 @@ For **most other use cases**, however, you probably want the latest, most recent > **Note:** > this fork has only been tested on Windows (yet), but should work on other platforms unless the original code doesn't work in the first place? +## Todo +- Better tokenisation rules? + - Chinese books are all empty after tokenisation -> use jieba, probably? + - Only tokens that return `True` for `str.isalpha()` are kept currently +- Faster method for getting bookshelves info? + + ## Installation :warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/).