-
Notifications
You must be signed in to change notification settings - Fork 38
Expand file tree
/
Copy pathcrawl.php
More file actions
executable file
·203 lines (169 loc) · 5.12 KB
/
crawl.php
File metadata and controls
executable file
·203 lines (169 loc) · 5.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
<?php
/**
 *
 * Main crawler function. Can be run manually or via cron.
 *
 * Append "?debug=true" to URL for verbose output, but performance will degrade over time (as browser buffer fills)
 *
 */
/**
 * Pull in the necessary include files.
 *
 * require_once (rather than include) is used because everything below
 * fatally depends on these: config.php defines $domains/$crawl_tag/
 * $mysql_db/$store_local, and the two function files define every helper
 * the crawl loop calls. A missing file should stop the run immediately
 * instead of producing a cascade of undefined-function fatals, and
 * *_once guards against function-redeclaration errors if this script is
 * ever itself included.
 */
require_once('config.php');
require_once('includes/functions.php');
require_once('includes/mysql_functions.php');
/**
 * Parse the comma-separated domain whitelist (from config.php) into an array.
 */
$domain_array = explode(',', $domains);
?>
<html>
<body>
<?php
/**
 * Emit the run header: start timestamp plus the key configuration values
 * (domain list, crawl tag, target database) so a cron log shows what this
 * run operated on.
 */
printf('<p>STARTED: <b>%s</b></p>', date('Y-m-d H:i:s'));
printf('<p>Domains: <b>%s</b></p>', $domains);
printf('<p>crawl_tag: <b>%s</b></p>', $crawl_tag);
printf('<p>database: <b>%s</b></p>', $mysql_db);
print '<p><b>Crawling...</b></p>';
/*
* Main crawl loop: fetch the batch of uncrawled URLs for this crawl_tag and
* process each one; repeat until uncrawled_urls() returns a falsy value
* (no work left). NOTE(review): uncrawled_urls() is defined in the includes —
* presumably it returns an array keyed by URL id; verify its contract there.
*/
while ($urls = uncrawled_urls($crawl_tag)) {
/**
* Loop through the array of uncrawled URLs ($id is the urls-table row ID,
* used later as the link source and as the UPDATE key).
*/
foreach ($urls as $id=>$url_data) {
/**
* If we're in debug mode, indicate that we are beginning to crawl a new URL.
* NOTE(review): the URL is urldecode()d and echoed without htmlspecialchars(),
* so a crafted URL can inject markup into this debug page — consider escaping.
*/
if (isset($_GET['debug']))
echo "<p style='font-weight:bold'>Starting to crawl " . urldecode($url_data['url']) . "</p><ul>";
/**
* If this is a seed URL (no 'clicks' value stored), set clicks to zero;
* otherwise, increment our internal click counter one beyond the parent's
* clicks, i.e. the click depth of any link discovered on this page.
*/
if (!isset($url_data['clicks'])) $clicks = 0;
else $clicks = $url_data['clicks'] + 1;
/**
* Fetch the page with cURL; returns an array with (at least) the keys used
* below: 'html', 'http_code', 'type', 'md5', 'reported_size', 'actual_size',
* 'modified'.
*/
$page_data = curl_page($url_data['url']);
/**
* Calculate the directory of the current page, used to resolve relative URLs
* found on it.
*/
$dir = parse_dir($url_data['url']);
/**
* Parse the title of the current page out of its HTML.
*/
$title = parse_title($page_data['html']);
/**
* Parse the HTML for links, store in an array.
*/
$links = parse_links($page_data['html']);
/**
* Loop through the array of links. Iterated by reference so clean_link()'s
* result can be written back in place.
* NOTE(review): $link is never unset() after this loop ends; the dangling
* reference would silently corrupt $links if it were iterated again later.
* Harmless here only because $links is rebuilt each outer iteration.
*/
foreach ($links as $key => &$link) {
/**
* Uniformly normalize the link so we don't store duplicates (make absolute,
* strip anchors, add www., etc.).
*/
$link = clean_link($link, $dir);
/**
* Skip links that point at images — we don't crawl them.
*/
if (is_image($link)) continue;
/**
* Skip links whose target is outside our configured domain list.
*/
if (out_of_domain($link)) continue;
/**
* Skip links excluded by a configured string/pattern match.
*/
if (exclude_by_pattern($link)) continue;
/**
* Skip mailto: links.
*/
if (is_mailto($link)) continue;
/**
* Check whether the URL is already in the table; if so, this returns its ID,
* otherwise a falsy value.
*/
$to = have_url($link,$crawl_tag);
/**
* If the link is not in the table yet, add it so a later loop iteration
* will crawl it.
*/
if (!$to) {
/**
* Output that we're adding a URL if we're in verbose mode.
*/
if (isset($_GET['debug']))
echo "<li>Adding url " . urldecode($link) . " to list</li>";
/**
* Add URL to table (with its click depth), grab the new row's ID.
*/
$to = add_url($link,$clicks,$crawl_tag);
}
/**
* If debug mode, indicate that we're recording the link edge.
*/
if (isset($_GET['debug']))
echo "<li>Adding link from here to " . urldecode($link) . "</li>";
/**
* Record the edge current-page ($id) -> target ($to) in the links table.
*/
add_link($id,$to);
}
/**
* If the server did not report a Content-Length (in which case cURL returns
* -1), fall back to the size cURL actually downloaded; otherwise trust the
* server's reported size.
*/
if ($page_data['reported_size'] != -1) $size = $page_data['reported_size'];
else $size = $page_data['actual_size'];
/**
* If the server returned a Last-Modified header, trust it; otherwise
* (cURL returns -1) store NULL.
*/
if ($page_data['modified'] != -1) $modified = $page_data['modified'];
else $modified = NULL;
/**
* Assemble the row of crawl results to write back to the urls table.
* 'html' defaults to NULL and is only filled in below when local storage
* is enabled for this content type.
*/
$data = array( 'crawled'=>1,
'title'=>$title,
'http_code' => $page_data['http_code'],
'size' => $size,
'type' => $page_data['type'],
'modified' => $modified,
'md5' => $page_data['md5'],
'crawl_tag' => $crawl_tag,
'html' => NULL
);
/**
* If config is set to store a local copy of the file, store it.
*/
if($store_local) {
// Split a Content-Type like "text/html; charset=UTF-8" into type + params
$type_info = explode("; ", $page_data['type']);
// Only store 'text/html' files
// TODO: enable a configurable range of file types to save
if($type_info[0] == 'text/html' ) {
$data['html'] = $page_data['html'];
}
}
/**
* Persist the crawl results for this URL (also marks it crawled, so it
* drops out of subsequent uncrawled_urls() batches).
*/
mysql_update('urls',$data,array('ID'=>$id));
/**
* If in debug mode, close the <ul> we opened above.
*/
if (isset($_GET['debug']))
echo "</ul>";
} //End foreach URL
} //End While uncrawled URLs
/**
* If we're done, let the user know the good news.
*
* empty() replaces sizeof($urls) == 0: this line runs after the while loop
* has exited, i.e. when uncrawled_urls() returned a falsy value. If that
* value is false (not an empty array), sizeof(false) throws a TypeError on
* PHP 8 (count() requires Countable|array); empty() handles both cases and
* preserves the intended "always report when nothing is left" behavior.
*/
if (empty($urls)) echo "<p>No URLs to crawl!</p>";
echo "<p>FINISHED: " . date('Y-m-d H:i:s') . "</p>";
?>
</body>
</html>