This repository was archived by the owner on Feb 9, 2023. It is now read-only.
forked from vladikk/JSON2CSV
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjson2csv.py
More file actions
148 lines (129 loc) · 5.58 KB
/
json2csv.py
File metadata and controls
148 lines (129 loc) · 5.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from itertools import chain
from collections import OrderedDict
from orderedset import OrderedSet
import json
import csv
import StringIO
import sys
import argparse
global allowedFields
def loadJSON_multipleLines(segments, maxRecords=100):
    """Incrementally parse JSON documents from text segments, in batches.

    Segments (for example the lines of an open file) are appended to a
    buffer until the buffer parses as one complete JSON document. This lets
    a single document span several segments. Parsed documents are grouped
    into lists of at most ``maxRecords`` entries so that callers can work on
    a sample of records at a time (e.g. to derive CSV headers).

    Args:
        segments: Iterable of text pieces, typically an open text file.
        maxRecords: Number of parsed documents per yielded batch.

    Yields:
        Lists of parsed documents; JSON objects are decoded as OrderedDict
        so that key order is preserved. A final batch is always yielded once
        the input is exhausted, and it may be empty.

    Note:
        Text left in the buffer at end of input that never became valid JSON
        is discarded without an error.
    """
    recordSet = []
    chunk = ""
    for segment in segments:
        chunk += segment
        try:
            record = json.loads(chunk, object_pairs_hook=OrderedDict)
        except ValueError:
            # Not a complete document yet (or a blank line): keep buffering.
            continue
        # A full document was consumed, so start a fresh buffer.
        chunk = ""
        recordSet.append(record)
        if len(recordSet) >= maxRecords:
            yield recordSet
            recordSet = []
    # Hand over whatever is left once the input is exhausted.
    yield recordSet
def create_fields_file(fields_file_path, input_file_path):
    """Write a fields file that enables every top-level key found in the input.

    Despite their names, both arguments are open file objects, not paths.
    The fields file is emptied and then filled with a JSON object that maps
    each top-level key seen in the input records to ``true``, in order of
    first appearance.

    Args:
        fields_file_path: Open, writable and seekable file object that
            receives the field list.
        input_file_path: Open, readable file object with the JSON records.

    Both file objects are closed when this function returns, so callers must
    reopen them before using them again.
    """
    # Clear any previous content so the new field list replaces it.
    fields_file_path.seek(0)
    fields_file_path.truncate()
    with fields_file_path as field_file, input_file_path as input_file:
        fields = OrderedDict()
        for batch in loadJSON_multipleLines(input_file):
            for record in batch:
                # Top-level arrays and scalars have no field names to offer;
                # skip them instead of failing on the missing keys().
                if not hasattr(record, "keys"):
                    continue
                for key in record.keys():
                    fields[key] = True
        json.dump(fields, field_file, indent=0)
def json_to_csv(input_file_path, output_file_path, fields_dict):
    """Convert JSON records from an open input file into CSV rows.

    Despite their names, the first two arguments are open file objects.
    Records are read in batches, flattened with ``json_to_dicts`` and written
    through a ``csv.DictWriter``.

    Args:
        input_file_path: Open, readable file object with the JSON records.
        output_file_path: Open, writable file object receiving the CSV.
        fields_dict: Mapping of top-level field name to a flag; only fields
            whose flag equals True are exported.

    Side effects:
        Sets the module-level ``allowedFields`` list that
        ``to_keyvalue_pairs`` reads to filter top-level keys.

    The CSV header is built from the first batch of records only. Columns
    that first appear in a later batch are left out of the output and are
    reported once on stderr, instead of aborting the conversion halfway.
    """
    global allowedFields
    # "== True" is kept on purpose: it accepts exactly the values the
    # original comparison accepted (True and numeric 1).
    allowedFields = [name for name, flag in fields_dict.items() if flag == True]  # noqa: E712

    output_csv = None
    known_columns = set()
    reported_columns = set()
    for batch in loadJSON_multipleLines(input_file_path):
        dicts = json_to_dicts(batch)
        if output_csv is None:
            # Ordered de-duplication: columns keep their first-seen order.
            fieldnames = list(OrderedDict.fromkeys(chain.from_iterable(dicts)))
            known_columns = set(fieldnames)
            output_csv = csv.DictWriter(
                output_file_path,
                fieldnames=fieldnames,
                extrasaction='ignore',
            )
            output_csv.writeheader()
        else:
            unseen = set(chain.from_iterable(dicts)) - known_columns - reported_columns
            if unseen:
                reported_columns |= unseen
                sys.stderr.write(
                    "Warning: ignoring fields missing from the CSV header: %s\n"
                    % ", ".join(sorted(unseen))
                )
        output_csv.writerows(dicts)
def json_to_dicts(json_str):
    """Flatten JSON records into a list of single-level ordered dicts.

    Args:
        json_str: One of
            * a JSON string holding one object or a list of objects,
            * an already parsed dict (treated as a single record),
            * an iterable of already parsed records.

    Returns:
        A list with one OrderedDict per record, built from the flattened
        key/value pairs produced by ``to_keyvalue_pairs``.
    """
    try:
        text_types = (basestring,)  # Python 2: covers str and unicode
    except NameError:
        text_types = (str,)

    if isinstance(json_str, text_types):
        objects = json.loads(json_str, object_pairs_hook=OrderedDict)
    else:
        objects = json_str

    # A single object, whether passed in or decoded from a string, is one
    # record. Iterating it directly would walk its keys instead.
    if isinstance(objects, dict):
        objects = [objects]

    return [OrderedDict(to_keyvalue_pairs(obj)) for obj in objects]
def to_keyvalue_pairs(source, ancestors=None, key_delimeter='_'):
    """Flatten a nested JSON value into a list of (key, value) pairs.

    Nested keys are joined with ``key_delimeter``; list items contribute
    their index as a key part. For example ``{"a": {"b": [1, 2]}}`` becomes
    ``[("a_b_0", 1), ("a_b_1", 2)]``.

    Args:
        source: Parsed JSON value (mapping, sequence or scalar).
        ancestors: List of key parts leading to ``source``; empty or None at
            the top level.
        key_delimeter: String placed between key parts at every depth.

    Returns:
        A list of ``(flattened_key, scalar_value)`` tuples. Empty mappings
        and empty sequences contribute no pairs.

    Top-level mapping keys are filtered through the module-level
    ``allowedFields`` collection. If it has not been set, or is None, no
    filtering is applied.
    """
    if ancestors is None:
        ancestors = []

    def is_sequence(arg):
        # Strings expose strip(); they must stay leaf values, otherwise
        # their characters would be walked recursively without end.
        return not hasattr(arg, "strip") and (
            hasattr(arg, "__getitem__") or hasattr(arg, "__iter__")
        )

    def is_dict(arg):
        return hasattr(arg, "keys")

    if is_dict(source):
        keys = list(source.keys())
        if not ancestors:
            try:
                allowed = allowedFields
            except NameError:
                allowed = None
            if allowed is not None:
                keys = [key for key in keys if key in allowed]
        nested = [
            to_keyvalue_pairs(source[key], ancestors + [key], key_delimeter)
            for key in keys
        ]
    elif is_sequence(source):
        nested = [
            to_keyvalue_pairs(item, ancestors + [str(index)], key_delimeter)
            for index, item in enumerate(source)
        ]
    else:
        return [(key_delimeter.join(ancestors), source)]

    return list(chain.from_iterable(nested))
'''
def dicts_to_csv(source, output_file):
#def build_row(dict_obj, keys):
# return [dict_obj.get(k, "") for k in keys]
keys = sorted(set(chain.from_iterable([o.keys() for o in source])))
#rows = [build_row(d, keys) for d in source]
#cw = csv.writer(output_file)
cw=output_file
#cw.writerow(keys)
#for row in rows:
#cw.writerow([c.encode('utf-8') if isinstance(c, str) or isinstance(c, unicode) else c for c in row])
cw.writerows(source)
'''
#def write_csv(headers, rows, file):
# cw = csv.writer(file)
# cw.writerow(headers)
# for row in rows:
# cw.writerow([c.encode('utf-8') if isinstance(c, str) or isinstance(c, unicode) else c for c in row])
if __name__ == '__main__':
    # Command-line entry point.
    #   * Without -o: only (re)generate the fields file from the input.
    #   * With -o: convert the input to CSV, exporting the fields enabled in
    #     the fields file. If that file is unusable, offer to regenerate it.
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--fields",type=argparse.FileType('a+'),default='fields.json',help="Field list file for output. If not given, a fields.json file will be created allowing all fields.")
    parser.add_argument("-o", "--output",type=argparse.FileType('w'), help="Destination file. To generate a fields.json only, leave blank.")
    parser.add_argument("JSON_FILE_PATH",type=argparse.FileType('r'), help="JSON log file to parse")
    args = parser.parse_args()

    if args.output is None:
        create_fields_file(args.fields, args.JSON_FILE_PATH)
    else:
        # Only loading the fields file is guarded, so that a ValueError
        # raised during the conversion is not reported as a bad fields file.
        try:
            # A file opened with 'a+' may be positioned at its end; rewind
            # so the existing content is actually read.
            args.fields.seek(0)
            fields_dict = json.load(args.fields, object_pairs_hook=OrderedDict)
            if not hasattr(fields_dict, "items"):
                raise ValueError("fields file must contain a JSON object")
        except ValueError:  # Fields file is empty, invalid, or did not exist
            try:
                ask = raw_input  # Python 2
            except NameError:
                ask = input
            answer = ask("Invalid fields file specified. Create default fields.json to continue (Y/N)? ")
            if answer.strip().upper() != 'Y':
                sys.exit()
            # create_fields_file clears the fields file before writing it.
            create_fields_file(args.fields, args.JSON_FILE_PATH)
            # Both files are closed by create_fields_file, so reopen them.
            with open(args.fields.name) as fields_file:
                fields_dict = json.load(fields_file, object_pairs_hook=OrderedDict)
            with open(args.JSON_FILE_PATH.name) as input_file:
                json_to_csv(input_file, args.output, fields_dict)
        else:
            json_to_csv(args.JSON_FILE_PATH, args.output, fields_dict)