-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrawler.php
More file actions
52 lines (47 loc) · 1.63 KB
/
crawler.php
File metadata and controls
52 lines (47 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
<?php
include "functions.php";
// Breadth-first crawl starting from a single seed URL.
//
// Every fetched page is parsed, its tags are collected, and every link found on
// it is appended to the crawl queue. Pages not yet known to the database are
// buffered in $already_crawled_update and written out in batches via update().
// The helpers connect_to_db(), alredy_crawled(), get_html(), get_tags(),
// get_links(), fix_url() and update() come from functions.php.

// The crawl is open-ended, so lift PHP's execution time limit.
ini_set('max_execution_time', 0);
$configs = include('config.php');
date_default_timezone_set('Europe/Prague');

// Real-world pages are rarely valid HTML. Collect libxml parse errors internally
// instead of letting DOMDocument::loadHTML() raise a PHP warning for each one.
libxml_use_internal_errors(true);

$conn = connect_to_db($configs["servername"], $configs["dbname"], $configs["username"], $configs["password"]);
$already_crawled = alredy_crawled($conn);
$already_crawled_update = array();

$seed_url = "https://www.youtube.com";
$to_be_crawled = array();
$to_be_crawled[] = $seed_url;

// URL => true index of everything that was ever queued. Gives an O(1) duplicate
// check instead of scanning the whole (ever-growing) queue for every link.
// NOTE(review): assumes fix_url() returns a string - confirm in functions.php.
$queued = array($seed_url => true);

$i = 0;
while (isset($to_be_crawled[$i])) {
    $url = $to_be_crawled[$i];
    $html = get_html($url);

    // Loose comparison on purpose: it skips false as well as an empty body,
    // and DOMDocument::loadHTML() rejects an empty string.
    if ($html != false) {
        $DOM = new DOMDocument('1.0', 'UTF-8');
        $DOM->loadHTML($html);
        // Drop the parse errors buffered for this page so they do not pile up
        // in memory over a long crawl.
        libxml_clear_errors();

        $tags = get_tags($DOM, $url);
        if (!in_array($url, $already_crawled)) {
            // Each new page adds TWO elements to the batch: the URL as a list
            // entry and a URL => tags mapping.
            // NOTE(review): presumably update() expects exactly this shape -
            // verify against functions.php before simplifying.
            $already_crawled_update[] = $url;
            $already_crawled_update[$url] = $tags;
            $already_crawled[] = $url;
        }

        $links = get_links($DOM);
        foreach ($links as $item) {
            $fix = fix_url($item, $url);
            if ($fix != "" && !isset($queued[$fix])) {
                $queued[$fix] = true;
                $to_be_crawled[] = $fix;
            }
        }
    }

    // Flush the batch once it holds 10 elements (i.e. five pages, see above).
    // ">=" rather than "==" so the flush cannot be skipped if the count ever
    // steps over the threshold.
    if (count($already_crawled_update) >= 10) {
        update($conn, $already_crawled_update);
        $already_crawled_update = array();
    }

    // A "stop" setting of 0 means no limit; otherwise the loop ends right after
    // processing queue index $configs["stop"].
    if ($configs["stop"] != 0 && $i == $configs["stop"]) {
        break;
    }
    $i++;
}

// Write whatever is left in the last, partially filled batch. Nothing to do
// when the loop ended right after a flush.
if (count($already_crawled_update) > 0) {
    update($conn, $already_crawled_update);
}
$already_crawled_update = array();