From a583624503c620fbea48bd5e197b4bd53d64fbb0 Mon Sep 17 00:00:00 2001 From: Alireza Rezaei <84126554+RezaeiAlireza@users.noreply.github.com> Date: Sun, 8 Jun 2025 13:56:11 +0200 Subject: [PATCH] Fixed DS_Store, FLAGS, lxml. Added FLAGS(sys.argv) to initialize the flags. Added .DS_Store to .gitignore and excluded it in pack_data.py. Switched the Cleaner import to lxml_html_clean in prepare_data.py. --- .gitignore | 3 +++ markuplm/examples/fine_tuning/run_swde/pack_data.py | 4 ++++ markuplm/examples/fine_tuning/run_swde/prepare_data.py | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 05353e4e7..b94441370 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ __pycache__/ # C extensions *.so +# Ignore Mac DS_Store +.DS_Store + # Distribution / packaging .Python build/ diff --git a/markuplm/examples/fine_tuning/run_swde/pack_data.py b/markuplm/examples/fine_tuning/run_swde/pack_data.py index 66550d4ee..b0b89ccb0 100644 --- a/markuplm/examples/fine_tuning/run_swde/pack_data.py +++ b/markuplm/examples/fine_tuning/run_swde/pack_data.py @@ -46,6 +46,7 @@ flags.DEFINE_integer("first_n_pages", -1, "The cut-off number to shorten the number of pages.") +FLAGS(sys.argv) # unknown flag error will be raised if the flags are not set correctly. def pack_swde_data(swde_path, pack_path, cut_off): """Packs the swde dataset to a single file. @@ -77,6 +78,9 @@ def pack_swde_data(swde_path, pack_path, cut_off): print("Start loading data...") for v in vertical_to_websites_map: for w in os.listdir(os.path.join(swde_path, v)): + # if is .DS_Store continue, o.w. 
causes an error on macOS + if ".DS_Store" in w: + continue page_count = 0 filenames = os.listdir(os.path.join(swde_path, v, w)) filenames.sort() diff --git a/markuplm/examples/fine_tuning/run_swde/prepare_data.py b/markuplm/examples/fine_tuning/run_swde/prepare_data.py index c73b8e830..17f3921d1 100644 --- a/markuplm/examples/fine_tuning/run_swde/prepare_data.py +++ b/markuplm/examples/fine_tuning/run_swde/prepare_data.py @@ -31,7 +31,7 @@ from absl import flags import lxml from lxml import etree -from lxml.html.clean import Cleaner +from lxml_html_clean import Cleaner from tqdm import tqdm import constants @@ -51,6 +51,7 @@ "The path of the output file containing both the input sequences and " "output sequences of the sequence tagging version of swde dataset.") +FLAGS(sys.argv) # unknown flag error will be raised if the flags are not set correctly. def clean_spaces(text): r"""Clean extra spaces in a string.