summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--index.php222
1 files changed, 207 insertions, 15 deletions
diff --git a/index.php b/index.php
index c9bb48c..e23fa19 100644
--- a/index.php
+++ b/index.php
@@ -69,9 +69,34 @@ if (! empty($_GET['crawl'])) {
],
]);
+ // Cap on how many sitemap entries get indexed during this crawl request;
+ // read from the `max-pages` query parameter, defaulting to 10 when absent.
+ $crawledPagesMax = intval($_GET['max-pages'] ?? 10);
+ // Running count of sitemap entries indexed so far; compared against the cap.
+ $crawledPagesCount = 0;
+
$responseCode = intval(explode(' ', get_headers("$baseUrl/sitemap.xml")[0])[1]);
if ($responseCode == 200) {
+ // Fetch the sitemap before parsing it: DOMDocument::loadHTML() throws a
+ // ValueError when handed an empty string, which is what a failed download
+ // (false) or an empty response body would turn into.
+ $sitemapSource = file_get_contents("$baseUrl/sitemap.xml", false, $streamContext);
+
+ $dom = new DOMDocument();
+ if ($sitemapSource !== false && $sitemapSource !== '') {
+     // The sitemap is run through the HTML parser; parser warnings are silenced.
+     @$dom->loadHTML($sitemapSource);
+ } else {
+     Logger::log("Could not download $baseUrl/sitemap.xml");
+ }
+ $xpath = new DomXPath($dom);
+
+ /**@var DOMNodeList $locations*/
+ $locations = $xpath->query('//loc');
+ foreach ($locations as $location) {
+     /*@var DOMNode $location*/
+     // Strip padding around the <loc> text so the protocol-relative check
+     // below looks at the real start of the URL.
+     $locationUrl = trim($location->textContent);
+     if ($locationUrl === '') {
+         // An empty <loc> is not a page: skip it without using up the page budget.
+         continue;
+     }
+     if (str_starts_with($locationUrl, '//')) {
+         // Protocol-relative URL: prefix the scheme taken from $targetUrl.
+         $locationUrl = "{$targetUrl['scheme']}:" . $locationUrl;
+     }
+
+     index_page($locationUrl);
+
+     // Stop once the configured number of pages has been indexed.
+     $crawledPagesCount++;
+     if ($crawledPagesCount >= $crawledPagesMax) {
+         break;
+     }
+ }
} else {
+ index_page($targetToCrawl);
+
$dom = new DOMDocument();
@$dom->loadHTML(file_get_contents($targetToCrawl, false, $streamContext));
$xpath = new DomXPath($dom);
@@ -87,19 +112,7 @@ if (! empty($_GET['crawl'])) {
continue;
}
- $content = file_get_contents($fullPath, false, $streamContext);
- $content = strip_tags($content);
-
- $db->prepare(<<<SQL
- insert into search_index (url, content, excerpt, title, timestamp) values (:url, :content, :excerpt, :title, :timestamp)
- SQL)
- ->execute([
- 'url' => $fullPath,
- 'content' => $content,
- 'excerpt' => 'EXCERPT',
- 'title' => 'TITLE',
- 'timestamp' => time(),
- ]);
+ index_page($fullPath);
}
}
@@ -109,19 +122,123 @@ if (! empty($_GET['crawl'])) {
// strip html => store text in db
}
+/**
+ * Downloads a page, extracts a title and an excerpt from its HTML and
+ * replaces the page's row in the `search_index` table.
+ *
+ * The excerpt is the `content` of the first `<meta name="description">`; when
+ * that is missing or empty, the trimmed text of the first `<p>` is used. The
+ * title is the trimmed text of the first `<title>`, or the URL itself when
+ * the page has none.
+ *
+ * @param string $url Address of the page; surrounding slashes and whitespace are stripped.
+ * @return bool Result of the insert statement, or false when the URL is empty
+ *              or the page could not be downloaded (an existing index row is
+ *              left untouched in that case).
+ */
+function index_page(string $url): bool {
+    global $db;
+
+    $url = trim($url, "/ \n\r\t\v\0");
+    if ($url === '') {
+        // file_get_contents() rejects an empty path, so there is nothing to index.
+        return false;
+    }
+
+    Logger::log("Indexing $url");
+
+    $streamContext = stream_context_create([
+        'http' => [
+            'follow_location' => true,
+        ],
+    ]);
+
+    $html = file_get_contents($url, false, $streamContext);
+    if ($html === false || $html === '') {
+        // DOMDocument::loadHTML() throws a ValueError on an empty string. Bail
+        // out before touching the database so a failed fetch keeps the old row.
+        Logger::log("Could not fetch $url");
+        return false;
+    }
+
+    $dom = new DOMDocument();
+    @$dom->loadHTML($html); // parser warnings about the page's markup are silenced
+    $xpath = new DomXPath($dom);
+
+    // NOTE(review): the raw HTML is stored as the searchable content, so the
+    // `like` search also matches markup; tag stripping is currently disabled.
+    // Confirm this is intended.
+    $content = $html;
+
+
+    // Excerpt
+
+    $excerpt = '';
+    /**@var DOMNodeList $descriptions*/
+    $descriptions = $xpath->query('//meta[@name="description"]');
+    if ($descriptions->count() > 0) {
+        // `??` covers a <meta name="description"> that has no content attribute.
+        $excerpt = $descriptions->item(0)->attributes->getNamedItem('content')->textContent ?? '';
+    }
+
+    if (empty($excerpt)) {
+        // No usable meta description: fall back to the first paragraph.
+        /**@var DOMNodeList $paragraphs*/
+        $paragraphs = $xpath->query('//p');
+        $excerpt = $paragraphs->count() > 0
+            ? trim($paragraphs->item(0)->textContent)
+            : '';
+    }
+
+
+    // Title
+
+    /**@var DOMNodeList $titles*/
+    $titles = $xpath->query('//title');
+    $title = $titles->count() > 0
+        ? trim($titles->item(0)->textContent)
+        : $url;
+
+
+    // Insert
+
+    // Remove any previous row for this URL so re-indexing does not duplicate it.
+    $db->prepare(<<<SQL
+        delete from search_index where url=:url
+    SQL)
+    ->execute([
+        'url' => $url,
+    ]);
+
+    return $db->prepare(<<<SQL
+        insert into search_index (url, content, excerpt, title, timestamp) values (:url, :content, :excerpt, :title, :timestamp)
+    SQL)
+    ->execute([
+        'url' => $url,
+        'content' => $content,
+        'excerpt' => $excerpt,
+        'title' => $title,
+        'timestamp' => time(),
+    ]);
+}
+
+/*
+ ****
+ * Search
+ ****
+ */
+
if (isset($_GET['search'])) {
+ // Page shell and search box. The submitted term is echoed back into the
+ // input's value attribute, so it is HTML-escaped to keep user input from
+ // breaking out of the attribute (reflected XSS). A non-string value such as
+ // search[]=x is rendered as an empty box.
+ $searchBoxValue = is_string($_GET['search'] ?? null) ? $_GET['search'] : '';
+ ?>
+ <!DOCTYPE html>
+ <html>
+ <body>
+ <div style="max-width: 1200px; margin: 0 auto;">
+ <form>
+ <input type="search" name="search" value="<?php echo htmlspecialchars($searchBoxValue, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>">
+ <input type="submit" value="Search">
+ </form>
+ <?php
+
if (! empty($_GET['search'])) {
$statement = $db->prepare(<<<SQL
select * from search_index
where content like :search
- SQL);
- $statement->execute(['search' => "%{$_GET['search']}%"]);
+ limit :limit offset :offset
+ SQL);
+ // The page number comes straight from the query string: cast it to int and
+ // clamp it to 1 so that a non-numeric value cannot raise a TypeError in the
+ // arithmetic and page <= 0 cannot produce a negative offset.
+ $statement->execute([
+     'search' => "%{$_GET['search']}%",
+     'limit' => 10,
+     'offset' => (max(1, intval($_GET['page'] ?? 1)) - 1) * 10,
+ ]);
$result = $statement->fetchAll();
foreach ($result as $row) {
+ $excerpt = $row['excerpt'];
+ if (empty($excerpt)) {
+ $excerpt = "EXCERPT";
+ }
+
?>
+ <hr>
<div>
<a href="<?php echo $row['url']; ?>"><?php echo $row['title']; ?></a>
<p><?php echo $row['excerpt']; ?></p>
@@ -129,5 +246,80 @@ if (isset($_GET['search'])) {
</div>
<?php
}
+
+ // Pager: re-submits every current query parameter except `page`, plus the
+ // page number of the clicked button.
+ $query = array_diff_key($_GET, array_flip(['page']));
+ // Current page, cast to int and clamped so "Previous" never points below page 1.
+ $pagerPage = max(1, intval($_GET['page'] ?? 1));
+ ?>
+ <form>
+ <?php foreach ($query as $key => $value): ?>
+ <?php if (! is_scalar($value)) { continue; } /* array parameters cannot be echoed as text */ ?>
+ <input type="hidden" name="<?php echo htmlspecialchars((string) $key, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>" value="<?php echo htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>">
+ <?php endforeach; ?>
+ <button name="page" value="<?php echo max(1, $pagerPage - 1); ?>"<?php echo $pagerPage <= 1 ? ' disabled' : ''; ?>>Previous</button>
+ <button name="page" value="<?php echo $pagerPage + 1; ?>">Next</button>
+ </form>
+ <?php
+ }
+
+ ?>
+ </div>
+ </body>
+ </html>
+ <?php
+}
+
+
+
+/*
+ ****
+ * API
+ ****
+ */
+
+if (isset($_GET['api'])) {
+
+    /*
+     ****
+     * API Search
+     ****
+     */
+
+    // Answers ?api&api_search=<term> with a JSON array of {url, title, excerpt}
+    // objects, one per indexed page whose stored content contains the term.
+    if (! empty($_GET['api_search'])) {
+        // Only the columns that end up in the response are selected, so the
+        // `content` column is not loaded for every match.
+        $statement = $db->prepare(<<<SQL
+            select url, title, excerpt from search_index
+            where content like :search
+        SQL);
+        $statement->execute(['search' => "%{$_GET['api_search']}%"]);
+        $result = $statement->fetchAll();
+
+        header('Content-Type: application/json');
+
+        $json = [];
+        foreach ($result as $row) {
+            // Fall back to the placeholder when no excerpt was stored for the page.
+            $excerpt = $row['excerpt'];
+            if (empty($excerpt)) {
+                $excerpt = "EXCERPT";
+            }
+
+            $json[] = [
+                'url' => $row['url'],
+                'title' => $row['title'],
+                // Emit the resolved excerpt; the raw column value would bypass the fallback.
+                'excerpt' => $excerpt,
+            ];
+        }
+
+        echo json_encode($json);
+    }
+}
+
+
+
+/*
+ ****
+ * Utilities
+ ****
+ */
+
+class Logger {
+    /**
+     * Writes a message to the output, terminated by a newline, with an HTML
+     * line break inserted before every newline.
+     *
+     * @param string $message Text to print.
+     */
+    static function log(string $message): void {
+        $line = "{$message}\n";
+        echo nl2br($line);
}
}