-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtable_version1.py
More file actions
62 lines (59 loc) · 1.48 KB
/
table_version1.py
File metadata and controls
62 lines (59 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from typing import Any
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import bs4
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Address of the report page to scrape; the query string carries the SN, PN
# and R parameters.
url = "https://mcs.ciena.com/InetReports/AssemblyHistory/ResultsByAO.asp?SN=NNTMRT112ND8&PN=NTK540BC-820&R=008"

# Fetch and parse inside a context manager so the HTTP response is closed as
# soon as the document has been read instead of being left open.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

# Print the page's <title> element as a quick check of what was fetched.
title = soup.title
print(title)
# Every <tr> element found in the parsed document.
rows = soup.find_all('tr')

# Collect the <td> cells of the rows after the first four <tr> elements.
# NOTE(review): the indentation of this loop was lost in this copy of the
# file. It is assumed that only the find_all() call belonged to the loop body,
# which leaves row_td holding the cells of the last row - confirm.
row_td = []  # stays empty when there are fewer than five <tr> elements
for row in rows[4:]:
    row_td = row.find_all('td')

# Preview the text of those cells: render the cell list as a string, then
# reparse it so that get_text() drops the markup.
str_cells = str(row_td)
cleantext = BeautifulSoup(str_cells, "lxml").get_text()
print(cleantext)
import re
# Pattern matching an HTML tag (non-greedy, from '<' to the next '>').
# Compiled once up front because it is the same for every row.
clean = re.compile(r'<.*?>')

# Build one string per table row, skipping the first four <tr> elements.
# Each string is the str() of the row's first 12 <td> cells with the tags
# removed, so it keeps the list's surrounding brackets and the ", " between
# cells. The later str.split(',') step relies on those commas.
# NOTE(review): a cell whose own text contains a comma will therefore be split
# into extra columns downstream.
list_rows = []
for row in rows[4:]:
    cells = row.find_all('td')
    str_cells = str(cells[0:12])
    clean2 = clean.sub('', str_cells)
    list_rows.append(clean2)

# Show the last cleaned row as a sanity check; skipped when no rows were
# processed, where the name clean2 would not exist.
# NOTE(review): the loop's indentation was lost in this copy of the file; the
# print is assumed to have followed the loop rather than run per row - confirm.
if list_rows:
    print(list_rows[-1])
# Load the cleaned row strings into a one-column frame (column label 0).
df = pd.DataFrame(list_rows)
df.head(10)  # result is not used; only meaningful in an interactive session

# Split every row string on commas, spreading the pieces across columns.
row_strings = df[0]
df1 = row_strings.str.split(',', expand=True)
df1.head(10)

# Strip '[' characters from the ends of the first column's values; this
# removes the opening bracket left over from the list rendering.
first_column = df1[0]
df1[0] = first_column.str.strip('[')
df1.head(10)
# All <th> elements in the parsed document.
col_labels = soup.find_all('th')

# Render the first 12 header cells as a single string, then reparse that
# string so get_text() returns it without the markup.
col_str = str(col_labels[0:12])
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()

# The header text is kept as a one-element list so it becomes a single row.
all_header = [cleantext2]
print(all_header)

# One-row, one-column frame holding the header string.
df2 = pd.DataFrame(all_header)
df2.head(10)

# Split the header string on commas into one column per label.
header_text = df2[0]
df3 = header_text.str.split(',', expand=True)
df3.head(10)
# Stack the header row (df3) on top of the data rows (df1). concat keeps each
# frame's own index, so the header row and the first data row both end up
# with index label 0.
frames = [df3, df1]
df4 = pd.concat(frames)
df4.head(20)

# Use the values of the first row (the header row) as the column labels.
df5 = df4.rename(columns=df4.iloc[0])
df5.head(20)

# Remove the header row by position. Dropping by label, as in
# df5.drop(df5.index[0]), would remove every row labelled 0 and so would also
# discard the first data row, which shares that label.
df6 = df5.iloc[1:]
df6.head(20)