1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
<?php
/*
****
* DB
****
*/
// Reads one configuration value from the environment, falling back to $default.
//
// $_ENV is only populated when php.ini's variables_order contains "E", which the
// stock php.ini files commonly leave out ("GPCS"). Reading $_ENV alone therefore
// ignores the real process environment on such setups, so getenv() is consulted
// as a fallback. $_ENV still wins when it holds the key, which keeps values that
// were injected there (for example by a dotenv loader) authoritative.
$readEnv = static function (string $key, string $default) {
    if (isset($_ENV[$key])) {
        return $_ENV[$key];
    }

    $value = getenv($key);

    return $value === false ? $default : $value;
};

$dbType = $readEnv('DB_TYPE', 'sqlite');
$dbPath = $readEnv('DB_PATH', './searcher.sqlite');
$dbHost = $readEnv('DB_HOST', '');
$dbPort = $readEnv('DB_PORT', '');
$dbName = $readEnv('DB_NAME', '');
$dbUsername = $readEnv('DB_USERNAME', '');
$dbPassword = $readEnv('DB_PASSWORD', '');

// Build the "key=value" DSN fragments for server-based drivers, leaving out
// settings that were not provided. The comparison is against the empty string
// rather than empty() so that a literal "0" is kept.
$dsnParts = [];
foreach (['host' => $dbHost, 'port' => $dbPort, 'dbname' => $dbName] as $key => $value) {
    if ((string) $value !== '') {
        $dsnParts[] = "$key=$value";
    }
}

// SQLite DSNs carry a file path; every other driver gets the ";"-joined fragments.
$dsn = "$dbType:" . ($dbType === 'sqlite' ? $dbPath : implode(';', $dsnParts));

// Kept so that $db lands in the global scope even when this file is included
// from inside a function; at the top level of a script it has no effect.
global $db;
$db = new PDO(
    $dsn,
    $dbUsername,
    $dbPassword,
    [
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
        // Already the default since PHP 8.0; stated explicitly so failed
        // statements can never be silently ignored.
        PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
    ],
);
// Creates the search index table when the "init--db" query parameter is present.
if (isset($_GET['init--db'])) {
    // PDO::query() already runs the statement, so the former trailing ->execute()
    // ran the DDL a second time; exec() runs it exactly once.
    //
    // "unique" used to be misspelled "unqiue". SQLite accepts that as part of the
    // column's type name, so no uniqueness constraint was ever created.
    // Because of "if not exists", a table created before this fix keeps its old
    // definition; the crawler below therefore does not depend on the constraint.
    $db->exec(<<<'SQL'
        create table if not exists search_index (
            id integer primary key,
            url text unique,
            content text,
            excerpt text,
            title text,
            timestamp integer
        );
        SQL);
}
/*
****
* Crawler
****
*/
// Indexes the pages of one site when the "crawl" query parameter holds an
// absolute http(s) URL. Page URLs come from the site's sitemap.xml when it is a
// plain <urlset>; otherwise from the links found on the given page. Only URLs on
// the same host are fetched, each at most once per run.
if (! empty($_GET['crawl']) && is_string($_GET['crawl'])) {
    $targetToCrawl = $_GET['crawl'];
    $targetUrl = parse_url($targetToCrawl) ?: [];
    $targetScheme = strtolower($targetUrl['scheme'] ?? '');
    $targetHost = $targetUrl['host'] ?? '';

    if (! in_array($targetScheme, ['http', 'https'], true) || $targetHost === '') {
        // Anything else (file://, php://, relative paths, ...) must never reach
        // file_get_contents(): the value comes straight from the query string.
        trigger_error('crawl: expected an absolute http(s) URL', E_USER_WARNING);
    } else {
        // NOTE(review): the host itself is not restricted, so this endpoint can
        // still be pointed at internal services. Put it behind authentication or
        // an allow-list before exposing it publicly.

        // The port is part of the origin; dropping it would send every resolved
        // link to the default port instead of the crawled server.
        $baseUrl = "$targetScheme://$targetHost" . (isset($targetUrl['port']) ? ":{$targetUrl['port']}" : '');
        $streamContext = stream_context_create([
            'http' => [
                'follow_location' => true,
            ],
        ]);

        // Real-world markup is rarely valid; collect parser complaints internally
        // instead of emitting a warning per problem (replaces the former "@").
        $previousLibxmlErrors = libxml_use_internal_errors(true);

        // Downloads a URL; null means the request failed.
        $fetch = static function (string $url) use ($streamContext): ?string {
            $body = file_get_contents($url, false, $streamContext);

            return $body === false ? null : $body;
        };

        // Turns an href found on the crawled page into an absolute http(s) URL,
        // or null when it does not point to a fetchable page.
        // Dot segments ("../") are passed through unchanged.
        $resolveLink = static function (string $href) use ($baseUrl, $targetScheme, $targetUrl): ?string {
            // A fragment never changes which document is fetched.
            $href = explode('#', trim($href), 2)[0];
            if ($href === '') {
                return null;
            }
            if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href) === 1) {
                // Has its own scheme: keep http(s), skip mailto:, tel:, javascript:, ...
                return preg_match('#^https?://#i', $href) === 1 ? $href : null;
            }
            if (str_starts_with($href, '//')) {
                return "$targetScheme:$href";
            }
            if (str_starts_with($href, '/')) {
                return $baseUrl . $href;
            }

            $path = $targetUrl['path'] ?? '/';
            if (str_starts_with($href, '?')) {
                return $baseUrl . $path . $href;
            }

            // Relative to the directory of the crawled page.
            return $baseUrl . substr($path, 0, (int) strrpos($path, '/') + 1) . $href;
        };

        // Collapses runs of whitespace so titles and excerpts are single-line.
        $collapseWhitespace = static fn (string $text): string => trim(preg_replace('/\s+/', ' ', $text) ?? $text);

        $candidates = [];

        // get_headers() returns false when the request fails; the status code is
        // taken from the first status line, as before.
        $sitemapHeaders = get_headers("$baseUrl/sitemap.xml");
        $responseCode = 0;
        if (
            is_array($sitemapHeaders)
            && preg_match('#^HTTP/\S+\s+(\d{3})#', (string) ($sitemapHeaders[0] ?? ''), $statusMatch) === 1
        ) {
            $responseCode = (int) $statusMatch[1];
        }

        if ($responseCode === 200) {
            // Use the sitemap's <loc> entries when it is a plain <urlset>.
            $sitemapXml = $fetch("$baseUrl/sitemap.xml");
            $sitemap = new DOMDocument();
            if (
                $sitemapXml !== null
                && trim($sitemapXml) !== ''
                && $sitemap->loadXML($sitemapXml, LIBXML_NONET)
                && $sitemap->documentElement?->localName === 'urlset'
            ) {
                foreach ($sitemap->getElementsByTagName('loc') as $loc) {
                    $candidates[] = trim($loc->textContent);
                }
            }
        }

        if ($candidates === []) {
            // No usable sitemap: fall back to the links on the requested page.
            $entryHtml = $fetch($targetToCrawl);
            // loadHTML() throws a ValueError on an empty string.
            if ($entryHtml !== null && trim($entryHtml) !== '') {
                $dom = new DOMDocument();
                $dom->loadHTML($entryHtml);
                $xpath = new DOMXPath($dom);
                // [@href] skips anchors without a target (e.g. <a name="...">).
                foreach ($xpath->query('//a[@href]') as $link) {
                    /** @var DOMElement $link */
                    $resolved = $resolveLink($link->getAttribute('href'));
                    if ($resolved !== null) {
                        $candidates[] = $resolved;
                    }
                }
            }
        }

        $deleteStatement = $db->prepare('delete from search_index where url = :url');
        $insertStatement = $db->prepare(<<<'SQL'
            insert into search_index (url, content, excerpt, title, timestamp)
            values (:url, :content, :excerpt, :title, :timestamp)
            SQL);

        $visited = [];
        foreach ($candidates as $fullPath) {
            $linkUrl = parse_url($fullPath) ?: [];
            $isSameSite = in_array(strtolower($linkUrl['scheme'] ?? ''), ['http', 'https'], true)
                && strcasecmp($linkUrl['host'] ?? '', $targetHost) === 0;
            if (! $isSameSite || isset($visited[$fullPath])) {
                continue;
            }
            $visited[$fullPath] = true;

            $html = $fetch($fullPath);
            if ($html === null) {
                // Unreachable page: do not store an empty row for it.
                continue;
            }

            $content = strip_tags($html);

            // Title: the page's <title>, falling back to its URL.
            $title = '';
            if (preg_match('#<title\b[^>]*>(.*?)</title>#is', $html, $titleMatch) === 1) {
                $title = $collapseWhitespace(
                    html_entity_decode(strip_tags($titleMatch[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8')
                );
            }
            if ($title === '') {
                $title = $fullPath;
            }

            // Excerpt: the first 200 characters of the page text.
            $excerpt = $collapseWhitespace($content);
            $excerpt = function_exists('mb_substr')
                ? mb_substr($excerpt, 0, 200, 'UTF-8')
                : substr($excerpt, 0, 200);

            // Replace any earlier row for this URL so that re-crawling refreshes
            // the index instead of piling up duplicates. This works with and
            // without a unique constraint on the url column.
            $db->beginTransaction();
            try {
                $deleteStatement->execute(['url' => $fullPath]);
                $insertStatement->execute([
                    'url' => $fullPath,
                    'content' => $content,
                    'excerpt' => $excerpt,
                    'title' => $title,
                    'timestamp' => time(),
                ]);
                $db->commit();
            } catch (Throwable $exception) {
                $db->rollBack();
                throw $exception;
            }
        }

        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlErrors);
    }
    // TODO: follow sitemap index files (<sitemapindex>) instead of falling back to link discovery
    // TODO: crawl recursively; $visited already guards against fetching a URL twice
    // TODO: decide whether URLs that differ only in their query string count as the same page
}
// Renders the indexed pages whose text contains the "search" query parameter.
// Nothing is rendered when the parameter is missing, empty, or not a string
// (e.g. "search[]=x", which would otherwise be interpolated as "Array").
$searchTerm = $_GET['search'] ?? null;
if (is_string($searchTerm) && $searchTerm !== '') {
    // Neutralise LIKE wildcards typed by the user so that "%" and "_" match
    // themselves; "!" is declared as the escape character in the query below.
    $likePattern = '%' . strtr($searchTerm, ['!' => '!!', '%' => '!%', '_' => '!_']) . '%';

    $statement = $db->prepare(<<<'SQL'
        select * from search_index
        where content like :search escape '!'
        SQL);
    $statement->execute(['search' => $likePattern]);
    $result = $statement->fetchAll();

    // Every column originates from crawled pages, i.e. untrusted input, and is
    // printed into HTML text and attribute contexts, so it must be escaped.
    $escape = static fn ($value): string => htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    foreach ($result as $row) {
        ?>
<div>
<a href="<?= $escape($row['url']) ?>"><?= $escape($row['title']) ?></a>
<p><?= $escape($row['excerpt']) ?></p>
<a href="<?= $escape($row['url']) ?>"><?= $escape($row['url']) ?></a>
</div>
<?php
    }
}
|