-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHTMLParser.py
More file actions
151 lines (120 loc) · 5.36 KB
/
HTMLParser.py
File metadata and controls
151 lines (120 loc) · 5.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# https://stackoverflow.com/questions/6325216/parse-html-table-to-python-list
# https://pbpython.com/pandas-html-table.html
import json
import pandas as pd
import numpy as np
import requests
import json
import base64
from unicodedata import normalize
from tkinter import *
# Page to download; passed to HTMLParser.downloadHTML as the request URL.
URL = "https://inet-olgr.justice.qld.gov.au/plugins/viewstorage/viewpagestorage.action?pageId=48334101"
# HTTP status code that downloadHTML treats as a successful response.
SUCCESS_CODE = 200
# For debug purposes: when True, HTMLParser.__init__ skips the login GUI and
# calls downloadHTML with that method's built-in default auth string.
USE_JAMES_CREDENTIALS = False
# CA certificate chain handed to requests as `verify=` when contacting URL.
# To retrieve a new CA cert: view the website's certificate in Firefox and
# download "PEM (chain)".
INET_CERT = "certs/inet-olgr-justice-qld-gov-au-chain.pem"
# HTML_fname: file downloadHTML saves the raw page to, and readHTML parses.
# JSON_FNAME: file the table extracted by readHTML is written to as JSON.
# The JSON output needs to be loaded as a dict to support the mailto script
# (that script is not part of this file -- confirm the expected schema there).
HTML_fname = 'storage_format.html'
JSON_FNAME = 'storage_format.json'
class HTMLParser:
    """Download the iNET storage-format page and export one table as JSON.

    Constructing an instance runs the whole workflow: obtain credentials
    (hard-coded default when ``USE_JAMES_CREDENTIALS`` is true, otherwise a
    Tk login dialog), download ``URL`` to ``HTML_fname``, extract the table
    matching "Email Groups" and write it to ``JSON_FNAME``.

    Attributes:
        credentials: Initialised to an empty string; not used by this class.
        parsed_website: True once the page was downloaded successfully.
        df: The pandas DataFrame of the extracted table (set by readHTML).
    """

    def __init__(self):
        """Run the download/convert workflow, prompting for a login if needed."""
        self.credentials = ''
        self.parsed_website = False

        if not USE_JAMES_CREDENTIALS:
            self.promptForCredentials()  # display GUI; blocks until it closes
            return

        self.parsed_website = self.downloadHTML(URL)
        if self.parsed_website:
            self.WriteDatatoFile(JSON_FNAME, self.readHTML(HTML_fname))
        else:
            # Was `url` (undefined name) -- raised NameError on this path.
            print("error parsing: " + URL)

    def promptForCredentials(self):
        """Show a fixed-size, screen-centred login window and run its event loop.

        The Login button (or <Return> while the button has focus) triggers
        ``Login``. This method returns when the Tk main loop exits.
        """
        # Local import so the class does not rely on the module-level
        # wildcard import of tkinter.
        import tkinter as tk

        self.root = tk.Tk()
        self.root.title("HTMLParser")
        width = 400
        height = 250
        screen_width = self.root.winfo_screenwidth()
        screen_height = self.root.winfo_screenheight()
        # Top-left corner that centres the window on the screen.
        x = (screen_width / 2) - (width / 2)
        y = (screen_height / 2) - (height / 2)
        self.root.geometry("%dx%d+%d+%d" % (width, height, x, y))
        self.root.resizable(0, 0)

        # ---- variables backing the two entry fields ----
        self.username = tk.StringVar()
        self.password = tk.StringVar()

        # ---- frames ----
        Top = tk.Frame(self.root, bd=2, relief=tk.RIDGE)
        Top.pack(side=tk.TOP, fill=tk.X)
        Form = tk.Frame(self.root)
        Form.pack(side=tk.TOP, pady=20)

        # ---- labels ----
        lbl_title = tk.Label(Top, text="HTMLParser: Login to iNET", font=('arial', 15))
        lbl_title.pack(fill=tk.X)
        lbl_username = tk.Label(Form, text="Username:", font=('arial', 14), bd=15)
        lbl_username.grid(row=0, sticky="e")
        lbl_password = tk.Label(Form, text="Password:", font=('arial', 14), bd=15)
        lbl_password.grid(row=1, sticky="e")
        # Empty label used by Login to show the validation message.
        self.lbl_text = tk.Label(Form)
        self.lbl_text.grid(row=2, columnspan=2)

        # ---- entry widgets ----
        # NOTE(review): `(14)` is the int 14, not a tuple; probably meant
        # ('arial', 14). Left unchanged to preserve the current layout.
        username = tk.Entry(Form, textvariable=self.username, font=(14))
        username.grid(row=0, column=1)
        password = tk.Entry(Form, textvariable=self.password, show="*", font=(14))
        password.grid(row=1, column=1)

        # ---- button widgets ----
        btn_login = tk.Button(Form, text="Login", width=30, command=self.Login)
        btn_login.grid(pady=10, row=3, columnspan=2)
        btn_login.bind('<Return>', lambda event=None: btn_login.invoke())

        self.root.mainloop()

    def Login(self, event=None):
        """Validate the form, then download the page and write the JSON file.

        Args:
            event: Optional Tk event, accepted so the method can be used as
                an event binding; it is not inspected.

        If either field is empty a red message is shown and the window stays
        open. Otherwise the window is hidden, the download is attempted with
        the entered credentials, ``parsed_website`` records the outcome, and
        the Tk main loop is stopped.
        """
        username = self.username.get()
        password = self.password.get()
        if not username or not password:
            self.lbl_text.config(text="Please complete the required field!", fg="red")
            return

        self.root.withdraw()  # hide the window while the request runs
        auth_str = self.generate_auth_str(username, password)
        self.parsed_website = self.downloadHTML(URL, auth_str)
        if self.parsed_website:
            self.WriteDatatoFile(JSON_FNAME, self.readHTML(HTML_fname))
        self.root.quit()  # leave mainloop() so promptForCredentials returns

    def generate_auth_str(self, login, password):
        """Return the base64 text of ``"login:password"`` for a Basic auth header.

        Args:
            login: User name.
            password: Password.

        Returns:
            The UTF-8 encoded ``login:password`` pair, base64-encoded, as str.
        """
        raw = (login + ":" + password).encode("utf-8")
        return base64.b64encode(raw).decode("utf-8")

    def downloadHTML(self, url, auth_str='YWNlcmV0anI6OTNudVl3aER4W1Uo', timeout=30):
        """GET ``url`` with Basic auth and save the response body to ``HTML_fname``.

        Args:
            url: Address to request.
            auth_str: Base64 credential placed after ``"Basic "`` in the
                Authorization header.
            timeout: Timeout passed to ``requests`` so an unresponsive server
                cannot hang the program.

        Returns:
            True if the server answered with ``SUCCESS_CODE`` and the body was
            written; False on any other status or on a transport error.
        """
        # SECURITY: the default auth_str is a credential committed to source
        # control. It is kept only so existing callers keep working; it
        # should be rotated and supplied from outside the code instead.
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Basic " + auth_str,
        }
        try:
            response = requests.request(
                "GET",
                url,
                headers=headers,
                verify=INET_CERT,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # Connection, TLS and timeout errors are reported like any other
            # failed download instead of crashing the caller.
            print("Request failed: ", exc)
            return False

        if response.status_code != SUCCESS_CODE:
            print("Server Response: ", response.status_code, " failure! reason:", response.reason)
            return False

        print("Server Response: ", response.status_code, " success! retrieved data from: ", response.url, )
        self.WriteDatatoFile(HTML_fname, response.text)
        return True

    def readHTML(self, fname):
        """Parse ``fname`` and return the first "Email Groups" table as JSON.

        Args:
            fname: Path of the saved HTML file.

        Returns:
            A JSON string with one object per table row (``orient='records'``).
            The DataFrame itself is kept on ``self.df``.
        """
        # WriteDatatoFile saves the page as UTF-8, so decode it the same way.
        table_read = pd.read_html(fname, match='Email Groups', encoding='utf-8')
        print(f'Total tables: {len(table_read)}')
        self.df = table_read[0]
        return self.df.to_json(orient='records')

    def WriteDatatoFile(self, fname, data):
        """Write the string ``data`` to ``fname``, replacing any existing file.

        Args:
            fname: Destination path.
            data: Text to write.
        """
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent characters found in the downloaded page.
        with open(fname, 'w', encoding='utf-8') as f:
            f.write(data)
def main():
    """Script entry point: build an HTMLParser, which runs the whole workflow."""
    HTMLParser()


if __name__ == "__main__":
    main()