$dbHost, 'port' => $dbPort, 'dbname' => $dbName, ];

// Drop connection parameters that are not set, then render the rest as "key=value" pairs.
$dsnParts = array_filter($dsnParts, fn ($value) => ! empty($value));
$dsnParts = array_map(function ($key, $value) {
    return "$key=$value";
}, array_keys($dsnParts), $dsnParts);

// The sqlite driver gets "$dbType:$dbPath"; every other driver gets the ";"-joined pairs.
$dsn = "$dbType:" . ($dbType == 'sqlite' ? $dbPath : implode(';', $dsnParts));

global $db;
$db = new PDO(
    $dsn,
    $dbUsername,
    $dbPassword,
    [
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
    ],
);

// NOTE(review): the body of this init statement is not visible in this view; it is kept verbatim.
if (isset($_GET['init--db'])) { $db->query(<<execute(); }

/* ****
 * Crawler
 **** */

/*
 * Request parameters:
 *   crawl     - absolute http(s) URL of the page/site to index
 *   max-pages - upper bound on the number of pages indexed per request (default 10)
 *
 * If the site serves /sitemap.xml, the URLs listed there are indexed. Otherwise
 * the target page itself is indexed, followed by the same-host pages it links to.
 */
if (! empty($_GET['crawl'])) {
    $targetToCrawl = $_GET['crawl'];
    // `crawl[]=...` arrives as an array; parse_url() only accepts strings.
    $targetUrl = is_string($targetToCrawl) ? parse_url($targetToCrawl) : false;

    // The target comes straight from the request. Restricting it to http(s) keeps
    // file_get_contents() away from local wrappers such as file:// or php://.
    $isCrawlable = is_array($targetUrl)
        && ! empty($targetUrl['host'])
        && in_array(strtolower($targetUrl['scheme'] ?? ''), ['http', 'https'], true);

    if (! $isCrawlable) {
        Logger::log('Invalid crawl target: an absolute http(s) URL is required');
    } else {
        $baseUrl = "{$targetUrl['scheme']}://{$targetUrl['host']}";
        if (isset($targetUrl['port'])) {
            // Keep a non-default port, otherwise the sitemap lookup hits the wrong server.
            $baseUrl .= ":{$targetUrl['port']}";
        }

        $streamContext = stream_context_create([
            'http' => [
                'follow_location' => true,
            ],
        ]);

        $crawledPagesMax = intval($_GET['max-pages'] ?? 10);
        $crawledPagesCount = 0;

        // Probe for a sitemap. get_headers() returns false when the request fails,
        // so the status line is only parsed when headers actually came back.
        $sitemapUrl = "$baseUrl/sitemap.xml";
        $sitemapHeaders = get_headers($sitemapUrl);
        $statusLine = is_array($sitemapHeaders) ? (string) ($sitemapHeaders[0] ?? '') : '';
        $responseCode = intval(explode(' ', $statusLine)[1] ?? 0);

        $sitemapBody = $responseCode === 200
            ? file_get_contents($sitemapUrl, false, $streamContext)
            : false;

        if (is_string($sitemapBody) && $sitemapBody !== '') {
            Logger::log('Found sitemap.xml');

            // Parsed with the HTML parser, exactly as before, so the plain `//loc`
            // query keeps working (presumably because that parser ignores the
            // sitemap XML namespace - confirm before switching to loadXML()).
            $dom = new DOMDocument();
            @$dom->loadHTML($sitemapBody);
            $xpath = new DOMXPath($dom);

            foreach ($xpath->query('//loc') as $location) {
                // Pretty-printed sitemaps pad <loc> with whitespace; trim before inspecting the prefix.
                $locationUrl = trim($location->textContent);
                if ($locationUrl === '') {
                    continue;
                }

                // Protocol-relative entry: inherit the scheme of the crawl target.
                if (str_starts_with($locationUrl, '//')) {
                    $locationUrl = "{$targetUrl['scheme']}:" . $locationUrl;
                }

                index_page($locationUrl);

                $crawledPagesCount++;
                if ($crawledPagesCount >= $crawledPagesMax) {
                    break;
                }
            }
        } else {
            // No usable sitemap: index the target itself, then the pages it links to.
            index_page($targetToCrawl);
            $crawledPagesCount++;

            // URLs already handled in this request, keyed without fragment and trailing slash.
            $visited = [rtrim(explode('#', $targetToCrawl, 2)[0], '/') => true];

            $html = file_get_contents($targetToCrawl, false, $streamContext);
            if (is_string($html) && $html !== '') {
                $dom = new DOMDocument();
                @$dom->loadHTML($html);
                $xpath = new DOMXPath($dom);

                // Only anchors that actually carry an href can be followed.
                foreach ($xpath->query('//a[@href]') as $link) {
                    if ($crawledPagesCount >= $crawledPagesMax) {
                        break;
                    }

                    $fullPath = resolve_crawl_link($link->getAttribute('href'), $targetUrl);
                    if ($fullPath === null) {
                        continue;
                    }

                    // Stay on the host that was requested.
                    $linkHost = parse_url($fullPath, PHP_URL_HOST);
                    if (! is_string($linkHost) || strcasecmp($linkHost, $targetUrl['host']) !== 0) {
                        continue;
                    }

                    $visitKey = rtrim($fullPath, '/');
                    if (isset($visited[$visitKey])) {
                        continue;
                    }
                    $visited[$visitKey] = true;

                    index_page($fullPath);
                    $crawledPagesCount++;
                }
            }
        }
    }

    // TODO: follow links recursively (currently only links found on the start page are indexed)
    // TODO: decide whether URLs differing only in query params count as already visited
    // strip html => store text in db
}

/**
 * Turns the href of an anchor into an absolute http(s) URL.
 *
 * @param string $href    Raw href attribute value.
 * @param array  $pageUrl parse_url() result of the page the link was found on;
 *                        must contain 'scheme' and 'host'.
 *
 * @return string|null Absolute URL without fragment, or null when the link cannot
 *                     be crawled (empty, fragment-only, mailto:, javascript:, ...).
 */
function resolve_crawl_link(string $href, array $pageUrl): ?string
{
    // A fragment only addresses a position inside a document, never another page.
    $href = trim(explode('#', $href, 2)[0]);
    if ($href === '') {
        return null;
    }

    // Explicit scheme: only http(s) links are crawlable.
    if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href) === 1) {
        return preg_match('~^https?://~i', $href) === 1 ? $href : null;
    }

    $scheme = $pageUrl['scheme'];
    if (str_starts_with($href, '//')) {
        return "$scheme:$href";
    }

    $origin = "$scheme://{$pageUrl['host']}" . (isset($pageUrl['port']) ? ":{$pageUrl['port']}" : '');
    $pagePath = $pageUrl['path'] ?? '/';

    // Query-only link: same document, different query string.
    if (str_starts_with($href, '?')) {
        return $origin . $pagePath . $href;
    }

    [$path, $query] = array_pad(explode('?', $href, 2), 2, null);
    if (! str_starts_with($path, '/')) {
        // Document-relative link: resolve against the directory of the current page.
        $path = substr($pagePath, 0, (int) strrpos($pagePath, '/') + 1) . $path;
    }

    // Collapse "." and ".." segments so equivalent links map to one URL.
    $segments = [];
    foreach (explode('/', $path) as $segment) {
        if ($segment === '..') {
            array_pop($segments);
        } elseif ($segment !== '.') {
            $segments[] = $segment;
        }
    }
    $path = '/' . ltrim(implode('/', $segments), '/');

    return $origin . $path . ($query !== null ? "?$query" : '');
}

/**
 * Fetches a page and stores it in the search index.
 *
 * Title and excerpt are extracted from the fetched HTML: the excerpt is the
 * meta description or, failing that, the first paragraph; the title falls
 * back to the URL itself.
 *
 * @param string $url Absolute http(s) URL of the page.
 *
 * @return bool Result of executing the final statement; false when the URL
 *              is not http(s) or the page could not be fetched.
 */
function index_page(string $url): bool
{
    global $db;

    $url = trim($url, "/ \n\r\t\v\0");

    // URLs reach this function from the request and from remote sitemaps/pages;
    // refuse anything that is not http(s) so no local stream wrapper is ever opened.
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
    if (! in_array($scheme, ['http', 'https'], true)) {
        Logger::log('Skipping URL with unsupported scheme');

        return false;
    }

    Logger::log("Indexing $url");

    $streamContext = stream_context_create([
        'http' => [
            'follow_location' => true,
        ],
    ]);
    $html = file_get_contents($url, false, $streamContext);
    if ($html === false || $html === '') {
        // DOMDocument::loadHTML() rejects an empty source with a ValueError on PHP 8,
        // which would abort the whole crawl; skip the page instead.
        Logger::log("Could not fetch $url");

        return false;
    }

    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    $xpath = new DOMXPath($dom);

    // The raw HTML is stored as content for now (see "strip html => store text in db").
    $content = $html;

    // Excerpt: meta description first, first paragraph as fallback.
    $excerpt = $xpath->query('//meta[@name="description"]');
    if ($excerpt->count() > 0) {
        $excerpt = $excerpt->item(0)->attributes->getNamedItem('content')->textContent ?? '';
    } else {
        $excerpt = '';
    }
    if (empty($excerpt)) {
        $excerpt = $xpath->query('//p');
        if ($excerpt->count() > 0) {
            $excerpt = $excerpt->item(0)->textContent;
            $excerpt = trim($excerpt);
        } else {
            $excerpt = '';
        }
    }

    // Title: <title> element, URL as fallback.
    $title = $xpath->query('//title');
    if ($title->count() > 0) {
        $title = $title->item(0)->textContent;
        $title = trim($title);
    } else {
        $title = $url;
    }

    // Domain
    $parsedUrl = parse_url($url);
    $domain = $parsedUrl['host'];

    // Insert
    // NOTE(review): the SQL heredocs of the next two statements are not visible in
    // this view; both statements are kept verbatim.
    $db->prepare(<<execute([ 'url' => $url, ]);
    return $db->prepare(<<execute([ 'url' => $url, 'domain' => $domain, 'content' => $content, 'excerpt' => $excerpt, 'title' => $title, 'timestamp' => time(), ]);
}

/* ****
 * Search
 **** */

/**
 * search
 * domain
 */
if (isset($_GET['search'])) { ?>
= 2) { ?>
    $a) { ?>


"%$query%", 'limit' => $limit, 'offset' => ($page - 1) * $limit, ...$mappingsDomain, ];

    $statement = $db->prepare($sql);
    $statement->execute($mappings);
    $rows = $statement->fetchAll();

    // Group the matching rows by the domain stored with each of them.
    $results = [];
    foreach ($rows as $row) {
        $results[$row['domain']][] = $row;
    }

    return [
        'results' => $results,
    ];
}

// NOTE(review): the rest of form_query_fields() and the opening of the API search
// handler are not visible in this view; the line below is kept verbatim.
function form_query_fields() { $query = array_diff_key($_GET, array_flip(['page'])); foreach ($query as $key => $value) { ?> prepare(<<execute(['search' => "%{$_GET['api_search']}%"]);

    $result = $statement->fetchAll();

    header('Content-Type: application/json');

    $json = [];
    foreach ($result as $row) {
        // NOTE(review): this fallback is computed but never used - the entry below
        // reads $row['excerpt'] directly. Confirm whether "EXCERPT" is a leftover
        // placeholder or whether 'excerpt' was meant to use $excerpt.
        $excerpt = $row['excerpt'];
        if (empty($excerpt)) {
            $excerpt = "EXCERPT";
        }

        $json[] = [
            'url' => $row['url'],
            'title' => $row['title'],
            'excerpt' => $row['excerpt'],
        ];
    }

    echo json_encode($json);
}
}

/* ****
 * Utilities
 **** */

/**
 * Minimal logger that writes messages straight into the response body.
 */
class Logger
{
    /**
     * Echoes a message followed by an HTML line break.
     *
     * The message is HTML-escaped before output: callers interpolate
     * request-derived text into it (e.g. the URL being indexed), which would
     * otherwise be reflected into the page as markup.
     *
     * @param string $message Plain-text message.
     */
    public static function log(string $message): void
    {
        echo nl2br(htmlspecialchars($message, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "\n");
    }
}