# Source: Python-Script-For-PDF-Data-Processing/process_dictionary.py at main · seanwillex/Python-Script-For-PDF-Data-Processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re
import pandas as pd
from pdfminer.high_level import extract_text
import unicodedata
import json

def read_pdf_file(file_path, first_page=15, stop_page=1000):
    """Extract and Unicode-normalize the text of a range of PDF pages.

    Args:
        file_path: Path of the PDF passed to pdfminer's ``extract_text``.
        first_page: Zero-based index of the first page to read. The default
            of 15 starts at the 16th page of the document.
        stop_page: Zero-based index at which reading stops (exclusive). The
            default of 1000 assumes the PDF has at most 1000 pages; raise it
            for longer documents.

    Returns:
        The extracted text after NFKD normalization.

    Raises:
        ValueError: If ``first_page`` is negative or the requested page
            window is empty.
    """
    if first_page < 0 or stop_page <= first_page:
        # An empty or negative window is always a caller mistake; fail loudly
        # rather than hand the extractor a page selection that means nothing.
        raise ValueError(
            f"invalid page window: first_page={first_page}, stop_page={stop_page}"
        )

    # Page indices are zero-based, so index 15 is the 16th page.
    text = extract_text(file_path, page_numbers=range(first_page, stop_page))
    # Normalize the Unicode characters to their NFKD form.
    return unicodedata.normalize('NFKD', text)

def clean_text(text):
    """Return *text* without characters that are neither printable nor whitespace."""
    def is_wanted(ch):
        # Whitespace (tabs, newlines, ...) is kept even though it is not "printable".
        return ch.isspace() or ch.isprintable()

    return ''.join(filter(is_wanted, text))

def parse_dictionary(content):
    """Parse extracted dictionary text into a list of entry dicts.

    A line opens a new entry when it has the shape
    ``<headword> <part of speech ending in "."> <semantic field> <gloss>``.
    Any other non-empty line is treated as a continuation of the current
    entry's English gloss. Lines seen before the first entry are dropped.

    Args:
        content: The dictionary text, with lines separated by ``\\n``.

    Returns:
        A list of dicts, in input order, each with the keys ``'Kịrịkẹ'``,
        ``'Part of Speech'``, ``'Semantic Field'`` and ``'English Gloss'``.
    """
    # Compiled once up front instead of being looked up for every line.
    entry_pattern = re.compile(r'^(\S+)\s+(\S+\.)\s+(\S+)\s+(.*)')

    entries = []
    current_entry = {}

    for raw_line in content.split('\n'):
        # Clean first, then strip: removing a non-printable character can
        # expose whitespace at the edges of the line. Stripping beforehand
        # would leave that whitespace in place, which breaks the ``^(\S+)``
        # match and lets whitespace-only remnants leak into the gloss.
        line = clean_text(raw_line).strip()
        if not line:
            continue

        # Check if this line starts a new entry
        match = entry_pattern.match(line)
        if match:
            if current_entry:
                entries.append(current_entry)
            kirike, pos, sf, english = match.groups()
            current_entry = {
                'Kịrịkẹ': kirike,
                'Part of Speech': pos,
                'Semantic Field': sf,
                'English Gloss': english
            }
        elif current_entry:
            # If not a new entry, append to the current English gloss
            current_entry['English Gloss'] += ' ' + line

    # Add the last entry
    if current_entry:
        entries.append(current_entry)

    return entries

def save_to_csv(entries, output_file):
    """Write *entries* to *output_file* as CSV.

    The rows are loaded into a pandas DataFrame and written without the
    index column, encoded as UTF-8 with a leading byte-order mark
    (``utf-8-sig``).
    """
    pd.DataFrame(entries).to_csv(
        output_file,
        encoding='utf-8-sig',
        index=False,
    )

def save_to_json(entries, output_file):
    """Write *entries* to *output_file* as UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than escaped, and the
    output is pretty-printed with a two-space indent.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(entries, indent=2, ensure_ascii=False))

def save_to_markdown(entries, output_file):
    """Write *entries* to *output_file* as a UTF-8 Markdown document.

    The file starts with a top-level title. Each entry then becomes a
    second-level heading with the headword, followed by its part of speech,
    semantic field and English gloss, and closes with a horizontal rule.
    """
    with open(output_file, 'w', encoding='utf-8') as md:
        md.write("# Kịrịkẹ-English Dictionary\n\n")
        for item in entries:
            md.writelines((
                f"## {item['Kịrịkẹ']}\n\n",
                f"**Part of Speech:** {item['Part of Speech']}\n\n",
                f"**Semantic Field:** {item['Semantic Field']}\n\n",
                f"**English Gloss:** {item['English Gloss']}\n\n",
                "---\n\n",
            ))

if __name__ == "__main__":
    # Script entry point: read the PDF, parse it into entries, then export
    # the entries in every supported format and print a short summary.
    input_file = "kirike-english-dictionary.pdf"  # Replace with your PDF file name
    output_csv = "kirike-english-dictionary.csv"
    output_json = "kirike-english-dictionary.json"
    output_md = "kirike-english-dictionary.md"

    print("Reading PDF file...")
    content = read_pdf_file(input_file)

    print("Parsing dictionary entries...")
    entries = parse_dictionary(content)

    # Each export step: progress message, writer function, destination path.
    export_steps = (
        ("Saving to CSV...", save_to_csv, output_csv),
        ("Saving to JSON...", save_to_json, output_json),
        ("Saving to Markdown...", save_to_markdown, output_md),
    )
    for announcement, writer, destination in export_steps:
        print(announcement)
        writer(entries, destination)

    print(f"Processed {len(entries)} entries.")
    print(f"CSV file saved: {output_csv}")
    print(f"JSON file saved: {output_json}")
    print(f"Markdown file saved: {output_md}")