Python/aprs_local.py at main · CharlieEvert/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import os
import re
import csv
import PyPDF2
import tiktoken
from concurrent.futures import ThreadPoolExecutor

num_cpu_cores = os.cpu_count()
folder_path = "/Volumes/My Passport for Mac/aprs/First_Load"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def save_to_csv(embeddings, output_file):
    with open(output_file, 'a', newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["doctype", "name", "page", "tokens", "text"])
        writer.writerows(embeddings)

    with open(output_file, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        row_count = sum(1 for row in reader) - 1
        print(f"Current length of CSV: {row_count} rows")


def process_pdf(file_path, counter, start_time, output_file):
    local_embeddings = []
    name = os.path.basename(file_path)
    try:
        if name.endswith(".pdf"):
            with open(file_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf_reader.pages)):
                    text = pdf_reader.pages[page_num].extract_text()
                    text = re.sub(r"^\s+|\s+?$", "", text)
                    text = re.sub(" +", " ", text).strip()
                    tokens = num_tokens_from_string(text, "cl100k_base")
                    embedding = {
                        "doctype": "apr",
                        "name": name,
                        "page": page_num,
                        "tokens": tokens,
                        "text": text
                    }
                    local_embeddings.append(embedding)

                    if len(local_embeddings) >= 1000:
                        save_to_csv(local_embeddings, output_file)
                        local_embeddings = []

        print(f"Successfully processed file: {counter}, Filename: {name}")
    except Exception as e:
        print(f"Failed to process file: {counter}, Filename: {name}, Error: {str(e)}")
    finally:
        elapsed_time = (time.time() - start_time) / 3600
        print(f"Processed file: {counter}, Elapsed time: {elapsed_time:.2f} hours")

csv_file = "/Users/cevert/Desktop/ai_projects/aprs/aprs.csv"  # Full path to aprs.csv

# Always create a new aprs.csv file, overwriting the old one.
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["doctype", "name", "page", "tokens", "text"])
    writer.writeheader()

pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]
start_time = time.time()

with ThreadPoolExecutor(max_workers=int(num_cpu_cores)) as executor:
    executor.map(process_pdf, pdf_files, range(len(pdf_files)), [start_time] * len(pdf_files), [csv_file] * len(pdf_files))

print("Processing completed.")