-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHolisticDocumentCrawler.php
More file actions
162 lines (137 loc) · 4.93 KB
/
Copy pathHolisticDocumentCrawler.php
File metadata and controls
162 lines (137 loc) · 4.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
<?php
/**
* Bit&Black Document Crawler.
*
* @author Tobias Köngeter
* @copyright Copyright © Bit&Black
* @link https://www.bitandblack.com
* @license MIT
*/
namespace BitAndBlack\DocumentCrawler;
use BitAndBlack\DocumentCrawler\Crawler\AnchorsCrawler;
use BitAndBlack\DocumentCrawler\Crawler\IconsCrawler;
use BitAndBlack\DocumentCrawler\Crawler\ImagesCrawler;
use BitAndBlack\DocumentCrawler\Crawler\LanguageCodeCrawler;
use BitAndBlack\DocumentCrawler\Crawler\MetaTagsCrawler;
use BitAndBlack\DocumentCrawler\Crawler\TitleCrawler;
use BitAndBlack\DocumentCrawler\DTO\Anchor;
use BitAndBlack\DocumentCrawler\DTO\Icon;
use BitAndBlack\DocumentCrawler\DTO\Image;
use BitAndBlack\DocumentCrawler\DTO\LanguageCode;
use BitAndBlack\DocumentCrawler\DTO\MetaTag;
use BitAndBlack\DocumentCrawler\HttpClient\HttpClientInterface;
use BitAndBlack\DocumentCrawler\HttpClient\HttpDiscoveryClient;
use BitAndBlack\DocumentCrawler\ResourceHandler\PassiveResourceHandler;
use BitAndBlack\DocumentCrawler\ResourceHandler\ResourceHandlerInterface;
use BitAndBlack\DocumentCrawler\Util\BaseUrl;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Crawls a document as a whole. On construction it runs, in this order:
 *
 * * the {@see IconsCrawler}
 * * the {@see ImagesCrawler}
 * * the {@see LanguageCodeCrawler}
 * * the {@see MetaTagsCrawler}
 * * the {@see TitleCrawler}
 * * and the {@see AnchorsCrawler}
 *
 * The getters of this class simply forward to the corresponding crawler.
 *
 * Instead of initializing the class with a document, it's also possible to use the
 * {@see HolisticDocumentCrawler::createFromUrl()} method and use a URL instead.
 */
readonly class HolisticDocumentCrawler
{
    private IconsCrawler $iconsCrawler;

    private ImagesCrawler $imagesCrawler;

    private LanguageCodeCrawler $languageCodeCrawler;

    private MetaTagsCrawler $metaTagsCrawler;

    private TitleCrawler $titleCrawler;

    private AnchorsCrawler $anchorsCrawler;

    /**
     * Builds one shared DOM crawler from the document and immediately runs every sub-crawler on it.
     *
     * @param string $document The content of an HTML or XML document.
     * @param string|null $baseUrl A URL that gets used for every relative URL in the document to make
     *                             an absolute URL out of it. When given, it is passed through
     *                             {@see BaseUrl} first; `null` is forwarded unchanged.
     * @param ResourceHandlerInterface $resourceHandler Handed to the icons, images and meta tags crawlers
     *                                                  before they crawl the content.
     */
    public function __construct(
        string $document,
        string|null $baseUrl = null,
        private ResourceHandlerInterface $resourceHandler = new PassiveResourceHandler(),
    ) {
        $normalisedBaseUrl = null === $baseUrl
            ? null
            : (string) new BaseUrl($baseUrl);

        $domCrawler = new Crawler($document, $normalisedBaseUrl);

        // The crawl order is kept stable on purpose: the resource handler is shared
        // between the crawlers and may observe the order in which they run.
        $icons = new IconsCrawler($domCrawler);
        $icons->setResourceHandler($resourceHandler);
        $icons->crawlContent();
        $this->iconsCrawler = $icons;

        $images = new ImagesCrawler($domCrawler);
        $images->setResourceHandler($resourceHandler);
        $images->crawlContent();
        $this->imagesCrawler = $images;

        $languageCode = new LanguageCodeCrawler($domCrawler);
        $languageCode->crawlContent();
        $this->languageCodeCrawler = $languageCode;

        $metaTags = new MetaTagsCrawler($domCrawler);
        $metaTags->setResourceHandler($resourceHandler);
        $metaTags->crawlContent();
        $this->metaTagsCrawler = $metaTags;

        $title = new TitleCrawler($domCrawler);
        $title->crawlContent();
        $this->titleCrawler = $title;

        $anchors = new AnchorsCrawler($domCrawler);
        $anchors->crawlContent();
        $this->anchorsCrawler = $anchors;
    }

    /**
     * Alternative constructor: requests the given URL with the HTTP client, takes the
     * response body as document and uses the same URL as base URL for the crawl.
     *
     * @throws Exception
     */
    public static function createFromUrl(
        string $url,
        ResourceHandlerInterface $resourceHandler = new PassiveResourceHandler(),
        HttpClientInterface $httpClient = new HttpDiscoveryClient(),
    ): self {
        return new self(
            $httpClient->requestUrl($url)->getBody()->getContents(),
            $url,
            $resourceHandler,
        );
    }

    /**
     * Returns the resource handler this instance has been created with.
     */
    public function getResourceHandler(): ResourceHandlerInterface
    {
        return $this->resourceHandler;
    }

    /**
     * Returns the meta tags collected by the {@see MetaTagsCrawler}.
     *
     * @return array<string, array<int, MetaTag>>
     */
    public function getMetaTags(): array
    {
        return $this->metaTagsCrawler->getMetaTags();
    }

    /**
     * Returns the icons collected by the {@see IconsCrawler}.
     *
     * @return array<int, Icon>
     */
    public function getIcons(): array
    {
        return $this->iconsCrawler->getIcons();
    }

    /**
     * Returns the title reported by the {@see TitleCrawler}, which may be `null`.
     */
    public function getTitle(): string|null
    {
        return $this->titleCrawler->getTitle();
    }

    /**
     * Returns the images collected by the {@see ImagesCrawler}.
     *
     * @return array<int, Image>
     */
    public function getImages(): array
    {
        return $this->imagesCrawler->getImages();
    }

    /**
     * Returns the language code reported by the {@see LanguageCodeCrawler}, which may be `null`.
     */
    public function getLanguageCode(): LanguageCode|null
    {
        return $this->languageCodeCrawler->getLanguageCode();
    }

    /**
     * Returns the anchors collected by the {@see AnchorsCrawler}.
     *
     * @return array<int, Anchor>
     */
    public function getAnchors(): array
    {
        return $this->anchorsCrawler->getAnchors();
    }
}