pyscript-pages-test/founders.py at main · pythonpaul/pyscript-pages-test · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# Standard library imports
import re
import sys
from datetime import datetime
import shutil

# Third-party imports
import pandas as pd
import usaddress
from usaddress import RepeatedLabelError
from nameparser import HumanName

sys.path.insert(0, 'python_files')
# Local imports
import dynamic_tsv

from io import StringIO
from js import document, window

def parse_line(line):
    """Split one raw placement record into its named fields.

    The record is consumed left to right: each regex in ``regex_patterns`` is
    searched against what is left of the line, the matched text is stored,
    and that text is then removed before the next pattern runs. Whatever is
    left after the last pattern becomes the ``A15`` value.

    Args:
        line: One text line of a placement file.

    Returns:
        dict with the keys ``A11``, ``A2.1``, ``A3.1``, ``A8``, ``A12``,
        ``F285``, ``M1``, ``M2`` and ``A15``. A field whose pattern did not
        match holds the string ``"None"`` (``M1`` is then ``"INS AGT: None"``).
    """
    print(line)
    # Collapse every run of whitespace so padded columns become single spaces.
    input_string = re.sub(r'\s+', ' ', line)

    # Each pattern grabs everything from the left-most index of the remaining
    # string up to the end signal of that field.
    regex_patterns = [r'^([A-Z]{4}\d{6})',
                    r'([A-Za-z\s.\'\&\-]+)',
                    # 7155 W 91ST ST APT 1W BRIDGEVIEW IL604550000000007085398253 02/09/2406/24/24AP3743 HARLEM INSURANCE AGENCY INC. 7089239500000138.33FI
                    r'^(.*?)\s[A-Z]{2}\d{5}',
                    # phone number
                    r'0{9}(\d{10})',
                    # date1
                    r'(\d{2}/\d{2}/\d{2})',
                    # date2
                    r'(\d{2}/\d{2}/\d{2})',
                    # insurance company name
                    r"([A-Za-z\s.,'&:/\d-]+)\s(?=\d+\.\d{2}[A-Z]{2})",
                    # ten leading digits (stored as M2); the rest is A15
                    r'^(\d{10})',
                    ]
    # Position of the insurance company name pattern in the list above.
    agency_name_index = 6
    data = []

    # enumerate() gives the true position of each pattern; list.index() would
    # report 4 for date2 because date1 and date2 are the same regex string.
    for index, pattern in enumerate(regex_patterns):
        if index == agency_name_index:
            # Drop the characters before the first space. partition() yields
            # '' when there is no space instead of raising IndexError.
            input_string = input_string.partition(' ')[2]

        match = re.search(pattern, input_string)
        if match is None:
            # Diagnostics: show what was left and which pattern failed.
            print(input_string)
            print(index)
            data.append("None")
            continue

        data.append(match.group(0))
        # Consume the matched text and trim the spaces left around it.
        input_string = input_string.replace(match.group(0), '', 1).strip()

    result_dict = {
        'A11': data[0],
        'A2.1': data[1],
        # still "<street> <city> <ST><zip>"; the state/zip split (OH44310 > OH 44310) happens later
        'A3.1': data[2],
        # todo: strip leading 0's
        'A8': data[3],
        # policy effective date (CHARGE DATE) (consumer columns: 285, 286, 287)
        'A12': data[4],
        # chargeoff date
        'F285': data[5],
        'M1': 'INS AGT: '+data[6],
        'M2': data[7],
        'A15': input_string # remaining string
    }

    return result_dict

def reformat_A15(value):
    """Clean a raw A15 value: drop every ``FI`` marker, then leading zeros.

    The input is passed through ``str()`` first, so non-string values are
    accepted. The cleaned text is returned as a string.
    """
    without_marker = str(value).replace('FI', '')
    return without_marker.lstrip('0')

def reformat_names(value):
    """Rewrite a personal name as ``"<last>, <suffix> <title> <first> <middle>"``.

    The name is split with ``HumanName``; the parts are laid out in the order
    above and the whitespace is then normalised.
    """
    parsed = HumanName(value)
    laid_out = f"{parsed.last}, {parsed.suffix} {parsed.title} {parsed.first} {parsed.middle}"
    # Empty parts would leave doubled or trailing blanks; squeeze them out.
    squeezed = re.sub(r'\s+', ' ', laid_out)
    return squeezed.strip()

def reformat_A8(value):
    """Return the A8 value with its leading ``0`` characters removed."""
    stripped = value.lstrip('0')
    return stripped

def reformat_address(value):
    """Separate the trailing five characters of an address and re-join its parts.

    A space is inserted in front of the last five characters (for example
    ``OH44310`` becomes ``OH 44310``). The result is tagged with
    ``usaddress`` and the tagged components are joined with single spaces.
    If tagging raises ``RepeatedLabelError``, the spaced-out string is
    returned unchanged.
    """
    try:
        # Insert the separator ahead of the final five characters.
        value = " ".join((value[:-5], value[-5:]))
        print(value)
        components = usaddress.tag(value)[0]
        return ' '.join(components.values())
    except usaddress.RepeatedLabelError:
        return value

    # # street address
    # # everything before placename
    # addr_dict['A3.1'] = value[:usaddress.tag(value)[0]['PlaceName']]
    # # city
    # addr_dict['A4'] = usaddress.tag(value)[0]['PlaceName']
    # # state
    # addr_dict['A5'] = usaddress.tag(value)[0]['StateName']
    # # zip code
    # addr_dict['A6'] = usaddress.tag(value)[0]['ZipCode']

# return addr_dict
# print(' '.join(usaddress.tag(input_string)[0].values()))
def city(value):
    """Return the ``PlaceName`` component of an address string.

    If ``usaddress`` raises ``RepeatedLabelError``, the third whitespace
    token from the end is returned instead. If the tagged address has no
    ``PlaceName`` entry, ``None`` is returned.
    """
    try:
        components = usaddress.tag(value)[0]
        return components['PlaceName']
    except usaddress.RepeatedLabelError:
        # Positional fallback: "... <city> <state> <zip>".
        tokens = value.split()
        return tokens[-3]
    except KeyError:
        return None

def state(value):
    """Return the ``StateName`` component of an address string.

    If ``usaddress`` raises ``RepeatedLabelError``, the second whitespace
    token from the end is returned instead. If the tagged address has no
    ``StateName`` entry, ``None`` is returned, matching how ``city`` and
    ``final_address`` treat a missing component.
    """
    try:
        return usaddress.tag(value)[0]['StateName']
    except usaddress.RepeatedLabelError:
        # Positional fallback: "... <city> <state> <zip>".
        return value.split()[-2]
    except KeyError:
        # Tagging succeeded but produced no state component.
        return None

def zip_code(value):
    """Return the ``ZipCode`` component of an address string.

    If ``usaddress`` raises ``RepeatedLabelError``, the last whitespace token
    is returned instead. If the tagged address has no ``ZipCode`` entry,
    ``None`` is returned, matching how ``city`` and ``final_address`` treat a
    missing component.
    """
    try:
        return usaddress.tag(value)[0]['ZipCode']
    except usaddress.RepeatedLabelError:
        # Positional fallback: "... <city> <state> <zip>".
        return value.split()[-1]
    except KeyError:
        # Tagging succeeded but produced no ZIP component.
        return None

# remove everything from placename onward
def final_address(value):
    """Return the part of an address that precedes its city.

    The city text is the ``PlaceName`` component reported by ``usaddress``.
    Everything from the last occurrence of that text onward is removed and
    the remainder is stripped.

    If ``usaddress`` raises ``RepeatedLabelError``, the last three whitespace
    tokens are dropped instead. If the tagged address has no ``PlaceName``
    entry, ``None`` is returned.
    """
    try:
        place_name = usaddress.tag(value)[0]['PlaceName']
    except usaddress.RepeatedLabelError:
        return ' '.join(value.split()[:-3])
    except KeyError:
        return None

    # Search from the right: the city follows the street, so a street that
    # contains the city name (e.g. "100 AKRON AVE AKRON OH 44310") must not
    # be cut at its first occurrence.
    cut = value.rfind(place_name)
    if cut == -1:
        # City text not found verbatim; keep the address rather than letting
        # a -1 index silently chop off the last character.
        return value.strip()
    return value[:cut].strip()
import os
def loop():
    """Convert every ``Placement*`` file in ``raw_data/FOUNDERS/`` and offer the CSVs for download.

    For each matching file, every non-blank line is parsed with
    ``parse_line``. Rows whose ``A15`` amount is below 50.00 are dropped, the
    name, phone and address columns are reformatted, the ``F286``-``F290``
    columns are added, and the result is written next to the input as
    ``<file>.csv``. The written CSV paths are then handed to
    ``dynamic_tsv.dynamic_tsv`` and each CSV is pushed to the browser as a
    download named ``data.csv``.
    """
    directory = 'raw_data/FOUNDERS/'

    # Any file whose name starts with "Placement" is treated as input.
    # NOTE(review): the "<file>.csv" outputs written below also start with
    # "Placement" and would be picked up by a later run - confirm they are
    # moved or removed between runs.
    placement_file_names = [
        os.path.join(directory, f)
        for f in os.listdir(directory)
        if f.startswith('Placement')
    ]

    # Create a directory named after today's date (mm-dd-yy). Nothing is
    # moved into it here; only the folder is created.
    today_date = datetime.now().strftime('%m-%d-%y')
    new_directory = os.path.join(directory, today_date)
    os.makedirs(new_directory, exist_ok=True)

    csv_file_names = []
    for file_name in placement_file_names:
        with open(file_name, 'r') as file:
            # Blank lines hold no record and cannot be parsed; skip them.
            records = [parse_line(line) for line in file if line.strip()]

        if not records:
            # An empty frame has no 'A15' column, so there is nothing to convert.
            print(f'{file_name}: no records found, skipping')
            continue

        df = pd.DataFrame(records)
        df['A15'] = df['A15'].apply(reformat_A15)
        # .copy() so the column assignments below act on an independent frame
        # rather than on a slice of the unfiltered one.
        df = df[df['A15'].astype(float) >= 50.00].copy()
        df['A2.1'] = df['A2.1'].apply(reformat_names)
        df['A8'] = df['A8'].apply(reformat_A8)
        # First address pass: normalise the full "street city state zip" text.
        df['A3.1'] = df['A3.1'].apply(reformat_address)
        df['A4'] = df['A3.1'].apply(city)
        df['A5'] = df['A3.1'].apply(state)
        df['A6'] = df['A3.1'].apply(zip_code)
        # Second address pass: keep only the part before the city.
        df['A3.1'] = df['A3.1'].apply(final_address)
        df['F286'] = 'CHARGE OFF DATE'

        # F287: principal amount, 99% of time. maybe interest barren, or fees.
        # if that's case, populate F288 (interest), F289 (fees)
        # F290 subsequent charges
        df['F287'] = df['A15']
        df['F288'] = 0.0
        df['F289'] = 0.0
        df['F290'] = 0.0

        # Column order: A4/A5/A6 right after A3.1, A15 after A12, then F286.
        df.insert(loc=3, column='A4', value=df.pop('A4'))
        df.insert(loc=4, column='A5', value=df.pop('A5'))
        df.insert(loc=5, column='A6', value=df.pop('A6'))
        df.insert(loc=8, column='A15', value=df.pop('A15'))
        df.insert(loc=9, column='F286', value=df.pop('F286'))
        # Both M1 and M2 are written under the same header "M".
        df = df.rename(columns={'M1': 'M', 'M2': 'M'})

        csv_file_name = f'{file_name}.csv'
        df.to_csv(csv_file_name, index=False)
        csv_file_names.append(csv_file_name)

    dynamic_tsv.dynamic_tsv(csv_file_names)

    for csv_file_name in csv_file_names:
        # Load the CSV that was just written; this is the payload of the
        # download (the name csv_content was previously never defined).
        with open(csv_file_name, 'r') as csv_file:
            csv_content = csv_file.read()

        # Create a Blob and trigger download through a temporary <a> element.
        # NOTE(review): the list/dict arguments are passed to JS as-is;
        # confirm the "type" option is honoured by the runtime in use.
        blob = window.Blob.new([csv_content], {"type": "text/csv"})
        url = window.URL.createObjectURL(blob)
        link = document.createElement("a")
        link.href = url
        link.download = "data.csv"
        document.body.appendChild(link)
        link.click()
        document.body.removeChild(link)
        window.URL.revokeObjectURL(url)


        # # Rename the tsv file to FOUNDERS.tsv based on Anitha's request

# Run the whole conversion as soon as this script is executed.
loop()

# import datetime as dtv

# from scheduler import Scheduler
# from scheduler.trigger import Monday, Tuesday, Wednesday, Thursday, Friday

# def foo():
#     print("foo")

# schedule = Scheduler()

# schedule.daily(dt.time(hour=10), Monday, Tuesday, Wednesday, Thursday, Friday, loop)

# print(schedule)
# import time

# while True:
#     schedule.exec_jobs()
#     time.sleep(1)

# input_string = "EZOH108809BRITTANY MICHELLE WUCHICH                                   1070 BERWIN ST                AKRON                         OH443100000000003303385397          05/24/2406/24/24AP7395    TRUECUT INSURANCE LLC         3304141150000034.79FI"
# parse_line(input_string)