forked from bartdegoede/python-searchengine
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload.py
More file actions
21 lines (17 loc) · 649 Bytes
/
load.py
File metadata and controls
21 lines (17 loc) · 649 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import gzip
from lxml import etree
import time
from search.documents import Abstract
def load_documents(path='data/enwiki-latest-abstract.xml.gz'):
    """Lazily yield one ``Abstract`` per ``<doc>`` element of a gzipped XML file.

    The file is parsed incrementally, so only the element currently being
    processed needs to be held in memory.

    Args:
        path: Location of the gzip-compressed XML file to read. Defaults to
            ``data/enwiki-latest-abstract.xml.gz``.

    Yields:
        Abstract: Built from the text of the ``title``, ``url`` and
        ``abstract`` children of each ``<doc>`` element. ``ID`` is the
        zero-based position of that element among the ``<doc>`` elements
        encountered. A missing child is passed through as ``None``.

    Note:
        Once the generator has been consumed to the end, the elapsed time is
        printed to stdout. Because this is a generator, that figure also
        includes whatever time the consumer spent between items.
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    with gzip.open(path, 'rb') as f:
        parsed = etree.iterparse(f, events=('end',), tag='doc')
        for doc_id, (_, element) in enumerate(parsed):
            title = element.findtext('./title')
            url = element.findtext('./url')
            abstract = element.findtext('./abstract')

            yield Abstract(ID=doc_id, title=title, url=url, abstract=abstract)

            # The text has been copied out above, so the subtree can go.
            element.clear()
            # clear() empties the element but leaves it attached to its
            # parent; without also dropping the already-processed siblings,
            # millions of empty nodes would pile up over a large dump.
            parent = element.getparent()
            if parent is not None:
                while element.getprevious() is not None:
                    del parent[0]
    end = time.perf_counter()
    print(f'Parsing XML took {end - start} seconds')