-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerateTranscripts.py
More file actions
executable file
·63 lines (48 loc) · 1.55 KB
/
generateTranscripts.py
File metadata and controls
executable file
·63 lines (48 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
"""
Python script to extract all of the transcripts from the Spongebob fandom website.
"""
from bs4 import BeautifulSoup
import urllib
import re
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
def main():
    """Scrape every SpongeBob episode transcript into one compiled text file.

    Opens the fandom wiki's transcript index in a headless Firefox driver,
    collects each per-episode transcript link, then writes every episode's
    dialogue lines to ``compiled_transcripts.txt``, each section preceded by
    a banner containing the episode title.
    """
    # Set up the web browser (headless: no visible window)
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    try:
        driver.get('https://spongebob.fandom.com/wiki/List_of_transcripts')
        soup = BeautifulSoup(driver.page_source, features="lxml")

        # Get all transcript links from the page: keep hrefs that mention
        # "transcript" but not the "transcripts" index page itself.
        # De-duplicate (preserving order) so an episode linked twice is not
        # written to the output file twice.
        transcript_links = []
        seen = set()
        for anchor in soup(href=True):
            href = anchor.get('href')
            if "transcript" in href and "transcripts" not in href and href not in seen:
                seen.add(href)
                transcript_links.append(href)
        print(transcript_links)

        # Extract the speech portion of each transcript page and compile
        # everything into a single txt file.
        separator = "--------------------\n"
        with open('compiled_transcripts.txt', 'w') as f:
            for transcript_url in transcript_links:
                url = "https://spongebob.fandom.com" + transcript_url
                # Link shape is /wiki/<Episode_Title>/transcript, so the
                # third path component is the episode title.
                episode_title = transcript_url.split('/')[2]
                print(url)
                print(episode_title)
                driver.get(url)
                page_soup = BeautifulSoup(driver.page_source, 'lxml')
                # The first <ul> inside the article body holds the dialogue lines.
                dialogue_list = page_soup.find(id="mw-content-text").find_all('ul')[0]
                items = [item.text for item in dialogue_list.select('li')]
                # Episode title header, framed by separator banners.
                f.write(separator)
                f.write(episode_title + "\n")
                f.write(separator)
                for item in items:
                    f.write(item)
    finally:
        # Always shut the browser down, even if scraping fails partway.
        driver.quit()
# Script entry point. Original line was a SyntaxError:
# `if __name__ = "__main__"` used assignment instead of `==` and
# omitted the trailing colon, so the file could not be run at all.
if __name__ == "__main__":
    main()