-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerateTranscripts.py
More file actions
executable file
·63 lines (48 loc) · 1.55 KB
/
generateTranscripts.py
File metadata and controls
executable file
·63 lines (48 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
"""
Python script to extract all of the transcripts from the Spongebob fandom website.
"""
from bs4 import BeautifulSoup
import urllib
import re
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
def main():
    """Scrape every SpongeBob episode transcript into one compiled text file.

    Opens the fandom wiki's transcript index in a headless Firefox driver,
    collects each per-episode transcript link, then writes every episode's
    dialogue lines to ``compiled_transcripts.txt``, each section preceded by
    a banner containing the episode title.
    """
    # Set up the web browser (headless: no visible window)
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    try:
        driver.get('https://spongebob.fandom.com/wiki/List_of_transcripts')
        soup = BeautifulSoup(driver.page_source, features="lxml")

        # Get all transcript links from the page: keep hrefs that mention
        # "transcript" but not the "transcripts" index page itself.
        # De-duplicate (preserving order) so an episode linked twice is not
        # written to the output file twice.
        transcript_links = []
        seen = set()
        for anchor in soup(href=True):
            href = anchor.get('href')
            if "transcript" in href and "transcripts" not in href and href not in seen:
                seen.add(href)
                transcript_links.append(href)
        print(transcript_links)

        # Extract the speech portion of each transcript page and compile
        # everything into a single txt file.
        separator = "--------------------\n"
        with open('compiled_transcripts.txt', 'w') as f:
            for transcript_url in transcript_links:
                url = "https://spongebob.fandom.com" + transcript_url
                # Link shape is /wiki/<Episode_Title>/transcript, so the
                # third path component is the episode title.
                episode_title = transcript_url.split('/')[2]
                print(url)
                print(episode_title)
                driver.get(url)
                page_soup = BeautifulSoup(driver.page_source, 'lxml')
                # The first <ul> inside the article body holds the dialogue lines.
                dialogue_list = page_soup.find(id="mw-content-text").find_all('ul')[0]
                items = [item.text for item in dialogue_list.select('li')]
                # Episode title header, framed by separator banners.
                f.write(separator)
                f.write(episode_title + "\n")
                f.write(separator)
                for item in items:
                    f.write(item)
    finally:
        # Always shut the browser down, even if scraping fails partway.
        driver.quit()
# Script entry point. Original line was a SyntaxError:
# `if __name__ = "__main__"` used assignment instead of `==` and
# omitted the trailing colon, so the file could not be run at all.
if __name__ == "__main__":
    main()