From 624d7c9ba5b28a3eeef99af7277bf3bacc817709 Mon Sep 17 00:00:00 2001 From: Daniel Weipert Date: Thu, 17 Oct 2024 13:17:25 +0200 Subject: better crawling, search frontend and search json api --- index.php | 222 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 207 insertions(+), 15 deletions(-) (limited to 'index.php') diff --git a/index.php b/index.php index c9bb48c..e23fa19 100644 --- a/index.php +++ b/index.php @@ -69,9 +69,34 @@ if (! empty($_GET['crawl'])) { ], ]); + $crawledPagesMax = intval($_GET['max-pages'] ?? 10); + $crawledPagesCount = 0; + $responseCode = intval(explode(' ', get_headers("$baseUrl/sitemap.xml")[0])[1]); if ($responseCode == 200) { + $dom = new DOMDocument(); + @$dom->loadHTML(file_get_contents("$baseUrl/sitemap.xml", false, $streamContext)); + $xpath = new DomXPath($dom); + + /**@var DOMNodeList $locations*/ + $locations = $xpath->query('//loc'); + foreach ($locations as $location) { + /*@var DOMNode $location*/ + $locationUrl = $location->textContent; + if (str_starts_with($locationUrl, '//')) { + $locationUrl = "{$targetUrl['scheme']}:" . $locationUrl; + } + + index_page($locationUrl); + + $crawledPagesCount++; + if ($crawledPagesCount >= $crawledPagesMax) { + break; + } + } } else { + index_page($targetToCrawl); + $dom = new DOMDocument(); @$dom->loadHTML(file_get_contents($targetToCrawl, false, $streamContext)); $xpath = new DomXPath($dom); @@ -87,19 +112,7 @@ if (! empty($_GET['crawl'])) { continue; } - $content = file_get_contents($fullPath, false, $streamContext); - $content = strip_tags($content); - - $db->prepare(<<execute([ - 'url' => $fullPath, - 'content' => $content, - 'excerpt' => 'EXCERPT', - 'title' => 'TITLE', - 'timestamp' => time(), - ]); + index_page($fullPath); } } @@ -109,19 +122,123 @@ if (! empty($_GET['crawl'])) { // strip html => store text in db } +function index_page(string $url): bool { + global $db; + + $url = trim($url, "/ \n\r\t\v\0"); + + Logger::log("Indexing $url"); + + $streamContext = stream_context_create([ + 'http' => [ + 'follow_location' => true, + ], + ]); + + $html = file_get_contents($url, false, $streamContext); + $dom = new DOMDocument(); + @$dom->loadHTML($html); + $xpath = new DomXPath($dom); + + // $content = strip_tags($content); + $content = $html; + + + // Excerpt + + /**@var DOMNodeList $excerpt*/ + $excerpt = $xpath->query('//meta[@name="description"]'); + if ($excerpt->count() > 0) { + $excerpt = $excerpt->item(0)->attributes->getNamedItem('content')->textContent ?? ''; + } else { + $excerpt = ''; + } + + if (empty($excerpt)) { + $excerpt = $xpath->query('//p'); + if ($excerpt->count() > 0) { + $excerpt = $excerpt->item(0)->textContent; + $excerpt = trim($excerpt); + } else { + $excerpt = ''; + } + } + + + // Title + + /**@var DOMNodeList $title*/ + $title = $xpath->query('//title'); + if ($title->count() > 0) { + $title = $title->item(0)->textContent; + $title = trim($title); + } else { + $title = $url; + } + + + // Insert + + $db->prepare(<<execute([ + 'url' => $url, + ]); + + return $db->prepare(<<execute([ + 'url' => $url, + 'content' => $content, + 'excerpt' => $excerpt, + 'title' => $title, + 'timestamp' => time(), + ]); +} + +/* + **** + * Search + **** + */ + if (isset($_GET['search'])) { + ?> + + + +
+
+ + +
+ prepare(<<execute(['search' => "%{$_GET['search']}%"]); + limit :limit offset :offset + SQL); + $statement->execute([ + 'search' => "%{$_GET['search']}%", + 'limit' => 10, + 'offset' => (($_GET['page'] ?? 1) - 1) * 10 + ]); $result = $statement->fetchAll(); foreach ($result as $row) { + $excerpt = $row['excerpt']; + if (empty($excerpt)) { + $excerpt = "EXCERPT"; + } + ?> +

@@ -129,5 +246,80 @@ if (isset($_GET['search'])) {
+
+ $value): ?> + + + + +
+ +
+ + + prepare(<<execute(['search' => "%{$_GET['api_search']}%"]); + $result = $statement->fetchAll(); + + header('Content-Type: application/json'); + + $json = []; + foreach ($result as $row) { + $excerpt = $row['excerpt']; + if (empty($excerpt)) { + $excerpt = "EXCERPT"; + } + + $json[] = [ + 'url' => $row['url'], + 'title' => $row['title'], + 'excerpt' => $row['excerpt'], + ]; + } + + echo json_encode($json); + } +} + + + +/* + **** + * Utilities + **** + */ + +class Logger { + static function log(string $message): void { + echo nl2br($message . "\n"); } } -- cgit v1.2.3