-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsvSplitBetter.py
More file actions
109 lines (98 loc) · 4.51 KB
/
csvSplitBetter.py
File metadata and controls
109 lines (98 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import csv
import sys

try:  # Python 3 module location
    from html.parser import HTMLParser
except ImportError:  # Python 2 module location
    from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text it is handed between tags.

    Text chunks reported through ``handle_data`` are collected in ``fed``;
    no other handler is overridden, so markup itself is not recorded.
    """

    def __init__(self):
        # Run the base-class initializer instead of calling reset() by hand,
        # so whatever state HTMLParser sets up at construction time exists.
        # An explicit call (not super()) is used so this also works where
        # HTMLParser is an old-style class (Python 2).
        HTMLParser.__init__(self)
        # NOTE(review): `strict` is kept from the original; it looks unused by
        # current HTMLParser versions -- confirm before removing.
        self.strict = False
        self.convert_charrefs = True
        # Text chunks, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return every chunk collected so far, joined into one string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Feed *html* through an ``MLStripper`` and return the text it collected.

    NOTE(review): the parser is never ``close()``d here, so text it is still
    buffering when the input ends may be missing from the result -- confirm
    whether a ``close()`` call is wanted.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
def removeSubstrings(inFile='hcsv.csv', outfile='aaa.csv'):
    """Clean a CSV file and split it into numbered output files.

    Every row of *inFile* is copied to ``'<n>' + outfile`` (``0aaa.csv``,
    ``1aaa.csv``, ...); each output file starts with the input's header row.
    Per cell:

    * a cell that is empty or contains only spaces becomes ``"0"``;
    * a cell longer than 30000 characters has the e-mail disclaimer text and
      every ``"\\n\\n"`` removed, and is then passed through ``strip_tags``.

    A new output file is started once more than 30000 rows have been written
    to the current one, so a file holds at most 30001 data rows.

    Args:
        inFile: Path of the CSV file to read; its first row is the header.
        outfile: Text appended to the running file number to form each output
            path. The number is prepended to the whole string, so this should
            be a bare file name rather than a path with directories.
    """
    boilerplate = ["""This e-mail transmission, and any documents, files or previous e-mail messages attached to it may contain confidential information that is legally privileged. If you are not the intended recipient, or a person responsible for delivering it to the intended recipient, you are hereby notified that any disclosure, copying, distribution or use of any of the information contained in or attached to this transmission is STRICTLY PROHIBITED. If you have received this transmission in error, please immediately notify the sender. Please destroy the original transmission and its attachments without reading or saving in any manner.""", "\n\n"]
    max_cell_chars = 30000
    rows_before_rollover = 30000

    # The csv module expects binary file objects on Python 2 and text file
    # objects opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        read_mode, write_mode, open_kwargs = 'r', 'w', {'newline': ''}
    else:
        read_mode, write_mode, open_kwargs = 'rb', 'wb', {}

    with open(inFile, read_mode, **open_kwargs) as source:
        reader = csv.DictReader(source)
        columns = reader.fieldnames
        filecounter = 0
        linecounter = 0
        out = open('%i' % filecounter + outfile, write_mode, **open_kwargs)
        try:
            writer = csv.DictWriter(out, columns)
            writer.writeheader()
            for row in reader:
                for col in columns:
                    value = row[col]
                    # Blank means "nothing left once spaces are removed",
                    # the same test splitFile and fineLine apply.
                    if not value.replace(' ', ''):
                        value = "0"
                    if len(value) > max_cell_chars:
                        for junk in boilerplate:
                            value = value.replace(junk, '')
                        # NOTE(review): the source's indentation was lost;
                        # HTML stripping is assumed to apply only to
                        # oversized cells -- confirm.
                        value = strip_tags(value)
                    row[col] = value
                writer.writerow(row)
                linecounter += 1
                if linecounter > rows_before_rollover:
                    # Current file is full: close it and start the next one.
                    out.close()
                    filecounter += 1
                    out = open('%i' % filecounter + outfile,
                               write_mode, **open_kwargs)
                    writer = csv.DictWriter(out, columns)
                    writer.writeheader()
                    linecounter = 0
        finally:
            # Also closes the last output file, which was previously left
            # open when the function returned.
            out.close()
def splitFile(inFile='hcsv.csv', outfile='aaa.csv'):
    """Copy a CSV file into numbered output files, tidying each cell.

    Every row of *inFile* is written to ``'<n>' + outfile`` (``0aaa.csv``,
    ``1aaa.csv``, ...); each output file starts with the input's header row.
    Per cell, in this order:

    * a cell that is empty or contains only spaces becomes ``"0"``;
    * the e-mail disclaimer text and every ``"\\n\\n"`` are removed.

    A new output file is started once more than 30000 rows have been written
    to the current one, so a file holds at most 30001 data rows.

    Args:
        inFile: Path of the CSV file to read; its first row is the header.
        outfile: Text appended to the running file number to form each output
            path. The number is prepended to the whole string, so this should
            be a bare file name rather than a path with directories.
    """
    boilerplate = ["""This e-mail transmission, and any documents, files or previous e-mail messages attached to it may contain confidential information that is legally privileged. If you are not the intended recipient, or a person responsible for delivering it to the intended recipient, you are hereby notified that any disclosure, copying, distribution or use of any of the information contained in or attached to this transmission is STRICTLY PROHIBITED. If you have received this transmission in error, please immediately notify the sender. Please destroy the original transmission and its attachments without reading or saving in any manner.""", "\n\n"]
    rows_before_rollover = 30000

    # The csv module expects binary file objects on Python 2 and text file
    # objects opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        read_mode, write_mode, open_kwargs = 'r', 'w', {'newline': ''}
    else:
        read_mode, write_mode, open_kwargs = 'rb', 'wb', {}

    with open(inFile, read_mode, **open_kwargs) as source:
        reader = csv.DictReader(source)
        columns = reader.fieldnames
        filecounter = 0
        linecounter = 0
        out = open('%i' % filecounter + outfile, write_mode, **open_kwargs)
        try:
            writer = csv.DictWriter(out, columns)
            writer.writeheader()
            for row in reader:
                for col in columns:
                    value = row[col]
                    if not value.replace(' ', ''):
                        value = "0"
                    for junk in boilerplate:
                        value = value.replace(junk, '')
                    row[col] = value
                writer.writerow(row)
                linecounter += 1
                if linecounter > rows_before_rollover:
                    # Current file is full: close it and start the next one.
                    out.close()
                    filecounter += 1
                    out = open('%i' % filecounter + outfile,
                               write_mode, **open_kwargs)
                    writer = csv.DictWriter(out, columns)
                    writer.writeheader()
                    linecounter = 0
        finally:
            # Also closes the last output file, which was previously left
            # open when the function returned.
            out.close()
def fineLine(inFile='hcsv.csv', linenumber=1):
    """Return one data row of a CSV file as a dict, with blanks zero-filled.

    Rows are counted from 1, not including the header row. In the returned
    row, any cell that is empty or contains only spaces is replaced by
    ``"0"``, and surplus fields beyond the header's columns are dropped.

    Args:
        inFile: Path of the CSV file to read; its first row is the header.
        linenumber: 1-based index of the data row to return. It now has a
            default because a parameter without one may not follow
            ``inFile='hcsv.csv'`` (the original signature was a SyntaxError).

    Returns:
        The requested row keyed by column name, or ``None`` when the file has
        fewer than *linenumber* data rows.
    """
    # The csv module expects a binary file object on Python 2 and a text file
    # object opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        read_mode, open_kwargs = 'r', {'newline': ''}
    else:
        read_mode, open_kwargs = 'rb', {}

    # The with-block closes the file on the early return as well.
    with open(inFile, read_mode, **open_kwargs) as source:
        reader = csv.DictReader(source)
        columns = reader.fieldnames
        for position, row in enumerate(reader, 1):
            if position != linenumber:
                continue
            # DictReader collects fields beyond the header under the key
            # None; discard them so only named columns remain.
            row.pop(None, None)
            for col in columns:
                if not row[col].replace(' ', ''):
                    row[col] = "0"
            return row
    return None