diff options
author | Daniel Weipert <git@mail.dweipert.de> | 2024-10-17 13:17:25 +0200 |
---|---|---|
committer | Daniel Weipert <git@mail.dweipert.de> | 2024-10-17 13:17:25 +0200 |
commit | 624d7c9ba5b28a3eeef99af7277bf3bacc817709 (patch) | |
tree | fbefc8d1aa3754d352232f20ff4d2ef2b956afe4 /index.php | |
parent | ef1e70b5d73fec98d8b3be96c9a776ff36566d2c (diff) |
better crawling, search frontend and search json api
Diffstat (limited to 'index.php')
-rw-r--r-- | index.php | 222 |
1 files changed, 207 insertions, 15 deletions
@@ -69,9 +69,34 @@ if (! empty($_GET['crawl'])) { ], ]); + $crawledPagesMax = intval($_GET['max-pages'] ?? 10); + $crawledPagesCount = 0; + $responseCode = intval(explode(' ', get_headers("$baseUrl/sitemap.xml")[0])[1]); if ($responseCode == 200) { + $dom = new DOMDocument(); + @$dom->loadHTML(file_get_contents("$baseUrl/sitemap.xml", false, $streamContext)); + $xpath = new DomXPath($dom); + + /**@var DOMNodeList $locations*/ + $locations = $xpath->query('//loc'); + foreach ($locations as $location) { + /*@var DOMNode $location*/ + $locationUrl = $location->textContent; + if (str_starts_with($locationUrl, '//')) { + $locationUrl = "{$targetUrl['scheme']}:" . $locationUrl; + } + + index_page($locationUrl); + + $crawledPagesCount++; + if ($crawledPagesCount >= $crawledPagesMax) { + break; + } + } } else { + index_page($targetToCrawl); + $dom = new DOMDocument(); @$dom->loadHTML(file_get_contents($targetToCrawl, false, $streamContext)); $xpath = new DomXPath($dom); @@ -87,19 +112,7 @@ if (! empty($_GET['crawl'])) { continue; } - $content = file_get_contents($fullPath, false, $streamContext); - $content = strip_tags($content); - - $db->prepare(<<<SQL - insert into search_index (url, content, excerpt, title, timestamp) values (:url, :content, :excerpt, :title, :timestamp) - SQL) - ->execute([ - 'url' => $fullPath, - 'content' => $content, - 'excerpt' => 'EXCERPT', - 'title' => 'TITLE', - 'timestamp' => time(), - ]); + index_page($fullPath); } } @@ -109,19 +122,123 @@ if (! empty($_GET['crawl'])) { // strip html => store text in db } +function index_page(string $url): bool { + global $db; + + $url = trim($url, "/ \n\r\t\v\0"); + + Logger::log("Indexing $url"); + + $streamContext = stream_context_create([ + 'http' => [ + 'follow_location' => true, + ], + ]); + + $html = file_get_contents($url, false, $streamContext); + $dom = new DOMDocument(); + @$dom->loadHTML($html); + $xpath = new DomXPath($dom); + + // $content = strip_tags($content); + $content = $html; + + + // Excerpt + + /**@var DOMNodeList $excerpt*/ + $excerpt = $xpath->query('//meta[@name="description"]'); + if ($excerpt->count() > 0) { + $excerpt = $excerpt->item(0)->attributes->getNamedItem('content')->textContent ?? ''; + } else { + $excerpt = ''; + } + + if (empty($excerpt)) { + $excerpt = $xpath->query('//p'); + if ($excerpt->count() > 0) { + $excerpt = $excerpt->item(0)->textContent; + $excerpt = trim($excerpt); + } else { + $excerpt = ''; + } + } + + + // Title + + /**@var DOMNodeList $title*/ + $title = $xpath->query('//title'); + if ($title->count() > 0) { + $title = $title->item(0)->textContent; + $title = trim($title); + } else { + $title = $url; + } + + + // Insert + + $db->prepare(<<<SQL + delete from search_index where url=:url + SQL) + ->execute([ + 'url' => $url, + ]); + + return $db->prepare(<<<SQL + insert into search_index (url, content, excerpt, title, timestamp) values (:url, :content, :excerpt, :title, :timestamp) + SQL) + ->execute([ + 'url' => $url, + 'content' => $content, + 'excerpt' => $excerpt, + 'title' => $title, + 'timestamp' => time(), + ]); +} + +/* + **** + * Search + **** + */ + if (isset($_GET['search'])) { + ?> + <!DOCTYPE html> + <html> + <body> + <div style="max-width: 1200px; margin: 0 auto;"> + <form> + <input type="search" name="search" value="<?php echo $_GET['search'] ?? ''; ?>"> + <input type="submit" value="Search"> + </form> + <?php + if (! empty($_GET['search'])) { $statement = $db->prepare(<<<SQL select * from search_index where content like :search - SQL); - $statement->execute(['search' => "%{$_GET['search']}%"]); + limit :limit offset :offset + SQL); + $statement->execute([ + 'search' => "%{$_GET['search']}%", + 'limit' => 10, + 'offset' => (($_GET['page'] ?? 1) - 1) * 10 + ]); $result = $statement->fetchAll(); foreach ($result as $row) { + $excerpt = $row['excerpt']; + if (empty($excerpt)) { + $excerpt = "EXCERPT"; + } + ?> + <hr> <div> <a href="<?php echo $row['url']; ?>"><?php echo $row['title']; ?></a> <p><?php echo $row['excerpt']; ?></p> @@ -129,5 +246,80 @@ if (isset($_GET['search'])) { </div> <?php } + + $query = array_diff_key($_GET, array_flip(['page'])); + ?> + <form> + <?php foreach ($query as $key => $value): ?> + <input type="hidden" name="<?php echo $key; ?>" value="<?php echo $value; ?>"> + <?php endforeach; ?> + <button name="page" value="<?php echo intval($_GET['page'] ?? 1) - 1; ?>">Previous</button> + <button name="page" value="<?php echo intval($_GET['page'] ?? 1) + 1; ?>">Next</button> + </form> + <?php + } + + ?> + </div> + </body> + </html> + <?php +} + + + +/* + **** + * API + **** + */ + +if (isset($_GET['api'])) { + + /* + **** + * API Search + **** + */ + + if (! empty($_GET['api_search'])) { + $statement = $db->prepare(<<<SQL + select * from search_index + where content like :search + SQL); + $statement->execute(['search' => "%{$_GET['api_search']}%"]); + $result = $statement->fetchAll(); + + header('Content-Type: application/json'); + + $json = []; + foreach ($result as $row) { + $excerpt = $row['excerpt']; + if (empty($excerpt)) { + $excerpt = "EXCERPT"; + } + + $json[] = [ + 'url' => $row['url'], + 'title' => $row['title'], + 'excerpt' => $row['excerpt'], + ]; + } + + echo json_encode($json); + } +} + + + +/* + **** + * Utilities + **** + */ + +class Logger { + static function log(string $message): void { + echo nl2br($message . "\n"); } } |