-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_data.py
More file actions
143 lines (128 loc) · 4.35 KB
/
extract_data.py
File metadata and controls
143 lines (128 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
MongoDB data extraction for training.
Extracts suppliers, quotes, and companies data.
"""
import json
import os

from pymongo import MongoClient
# MongoDB connection settings.
# NOTE(security): the fallback URI below embeds a username and password in
# source control. It is kept only so existing runs behave exactly as before;
# prefer exporting MONGO_URI / MONGO_DB_NAME in the environment and rotating
# the hard-coded secret.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://trypromptlab:trypromptlabadmin!@localhost:27018/promptlab_production?authSource=admin",
)
DB_NAME = os.environ.get("MONGO_DB_NAME", "promptlab_production")
def connect_db():
    """Open a MongoDB client and return the configured database handle.

    Returns:
        The database object obtained by indexing a ``MongoClient`` built
        from ``MONGO_URI`` with ``DB_NAME``.
    """
    return MongoClient(MONGO_URI)[DB_NAME]
def extract_suppliers(db):
    """Extract supplier records that have a usable company name and country.

    Args:
        db: Database handle exposing a ``suppliers`` collection with a
            pymongo-style ``find(filter, projection)`` method.

    Returns:
        list: The matching supplier documents, limited to the projected
        fields requested below.
    """
    # A Python dict literal keeps only the LAST value of a repeated key, so
    # {"$ne": None, "$ne": ""} silently collapses to {"$ne": ""} and lets
    # null values through. "$nin" rejects both values with a single operator.
    non_empty = {"$exists": True, "$nin": [None, ""]}
    suppliers = list(db.suppliers.find(
        {
            "companyName": non_empty,
            "country": non_empty,
        },
        {
            "companyName": 1,
            "contactName": 1,
            "email": 1,
            "phone": 1,
            "country": 1,
            "city": 1,
            "region": 1,
            "main_business_category": 1,
            "website_url": 1,
        },
    ))
    print(f"Extracted {len(suppliers)} suppliers (with valid data)")
    return suppliers
def extract_companies(db):
    """Extract searched-company records that have name, description and country.

    Args:
        db: Database handle exposing a ``searched_companies`` collection
            with a pymongo-style ``find(filter, projection)`` method.

    Returns:
        list: The matching company documents, limited to the projected
        fields requested below.
    """
    # A Python dict literal keeps only the LAST value of a repeated key, so
    # {"$ne": None, "$ne": ""} silently collapses to {"$ne": ""} and lets
    # null values through. "$nin" rejects both values with a single operator.
    non_empty = {"$exists": True, "$nin": [None, ""]}
    companies = list(db.searched_companies.find(
        {
            "company_name": non_empty,
            "short_description": non_empty,
            "main_address.country": non_empty,
        },
        {
            "company_name": 1,
            "short_description": 1,
            "long_description_extracted": 1,
            "business_tags_generated": 1,
            "main_address": 1,
            "year_founded": 1,
            "employee_count": 1,
            "revenue": 1,
            "main_business_category": 1,
            "main_industry": 1,
            "website_url": 1,
            "primary_email": 1,
            "primary_phone": 1,
        },
    ))
    print(f"Extracted {len(companies)} companies (with valid data)")
    return companies
def extract_quotes(db):
    """Extract sourcing quotes that carry a positive price, MOQ and a lead time.

    Args:
        db: Database handle exposing a ``sourcing_quotes`` collection with
            a pymongo-style ``find(filter, projection)`` method.

    Returns:
        list: The matching quote documents, limited to the projected
        fields requested below.
    """
    quotes = list(db.sourcing_quotes.find(
        {
            "price": {"$exists": True, "$ne": None, "$gt": 0},
            "moq": {"$exists": True, "$ne": None, "$gt": 0},
            # The original repeated "$ne" twice in one dict literal; Python
            # keeps only the last value, so the null check was lost. "$nin"
            # excludes both null and the empty string.
            "leadTime": {"$exists": True, "$nin": [None, ""]},
        },
        {
            "companyId": 1,
            "price": 1,
            "currency": 1,
            "moq": 1,
            "leadTime": 1,
        },
    ))
    print(f"Extracted {len(quotes)} quotes (with valid price/moq/leadTime)")
    return quotes
def extract_procurement_requests(db):
    """Fetch every procurement request, keeping only product items and request id.

    Args:
        db: Database handle exposing a ``procurement_requests`` collection
            with a pymongo-style ``find(filter, projection)`` method.

    Returns:
        list: All documents returned by the unfiltered query.
    """
    wanted_fields = {
        "productItems": 1,
        "reqId": 1,
    }
    # Empty filter: no documents are excluded.
    cursor = db.procurement_requests.find({}, wanted_fields)
    records = list(cursor)
    print(f"Extracted {len(records)} procurement requests")
    return records
def clean_data(data):
    """Recursively convert ObjectId values (and ``_id`` fields) to strings.

    Lists and dicts are walked recursively. Any dict entry keyed ``"_id"``
    is stringified regardless of its type. Any value whose class name
    contains ``ObjectId`` is stringified wherever it appears: as a dict
    value, as a list element, or as the top-level argument. Everything
    else is returned unchanged.

    Args:
        data: A document, a list of documents, or any nested value.

    Returns:
        A new structure of the same shape with the conversions applied;
        the input containers are not mutated.
    """
    if isinstance(data, list):
        return [clean_data(item) for item in data]
    if isinstance(data, dict):
        return {
            key: str(value) if key == "_id" else clean_data(value)
            for key, value in data.items()
        }
    # Matched by class name so no direct dependency on the bson package is
    # needed. Checking here (rather than only on dict values) also covers
    # ObjectIds stored inside lists.
    if "ObjectId" in type(data).__name__:
        return str(data)
    return data
def save_json(data, filename):
    """Write *data* to *filename* as pretty-printed UTF-8 JSON.

    The data is first passed through ``clean_data``; any value that is
    still not JSON-serializable is written via ``str`` (``default=str``).

    Args:
        data: The records to serialize.
        filename: Path of the output file; it is opened in write mode.
    """
    serializable = clean_data(data)
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(
            serializable,
            out_file,
            indent=2,
            ensure_ascii=False,
            default=str,
        )
    print(f"Saved to {filename}")
def main():
    """Run the whole pipeline: connect, extract four datasets, save them as JSON.

    Output files are written to the current working directory as
    ``data_suppliers.json``, ``data_companies.json``, ``data_quotes.json``
    and ``data_requests.json``. Progress and a final record count are
    printed to stdout.
    """
    print("Connecting to MongoDB...")
    db = connect_db()

    print("\nExtracting data...")
    # Tuple elements are evaluated left to right, so the extraction order is
    # suppliers, companies, quotes, requests.
    datasets = (
        (extract_suppliers(db), "data_suppliers.json"),
        (extract_companies(db), "data_companies.json"),
        (extract_quotes(db), "data_quotes.json"),
        (extract_procurement_requests(db), "data_requests.json"),
    )

    print("\nSaving data...")
    for records, filename in datasets:
        save_json(records, filename)

    print("\nDone! Data extracted successfully.")
    total = sum(len(records) for records, _ in datasets)
    print(f"Total records: {total}")


if __name__ == "__main__":
    main()