From ef1e70b5d73fec98d8b3be96c9a776ff36566d2c Mon Sep 17 00:00:00 2001 From: Daniel Weipert Date: Fri, 19 Jul 2024 14:07:32 +0200 Subject: initial commit --- .gitignore | 1 + index.php | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 .gitignore create mode 100644 index.php diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9b1dffd --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.sqlite diff --git a/index.php b/index.php new file mode 100644 index 0000000..c9bb48c --- /dev/null +++ b/index.php @@ -0,0 +1,133 @@ + $dbHost, + 'port' => $dbPort, + 'dbname' => $dbName, +]; +$dsnParts = array_filter($dsnParts, fn ($value) => ! empty($value)); +$dsnParts = array_map(function ($key, $value) { + return "$key=$value"; +}, array_keys($dsnParts), $dsnParts); + +$dsn = "$dbType:" . ($dbType == 'sqlite' ? $dbPath : implode(';', $dsnParts)); + +global $db; +$db = new PDO( + $dsn, + $dbUsername, + $dbPassword, + [ + PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC, + ], +); + +if (isset($_GET['init--db'])) { + $db->query(<<execute(); +} + + + +/* + **** + * Crawler + **** + */ + +if (! empty($_GET['crawl'])) { + $targetToCrawl = $_GET['crawl']; + $targetUrl = parse_url($targetToCrawl); + $baseUrl = "{$targetUrl['scheme']}://{$targetUrl['host']}"; + + $streamContext = stream_context_create([ + 'http' => [ + 'follow_location' => true, + ], + ]); + + $responseCode = intval(explode(' ', get_headers("$baseUrl/sitemap.xml")[0])[1]); + if ($responseCode == 200) { + } else { + $dom = new DOMDocument(); + @$dom->loadHTML(file_get_contents($targetToCrawl, false, $streamContext)); + $xpath = new DomXPath($dom); + + $links = $xpath->query('//a'); + foreach ($links as $link) { + /*@var DOMNode $link*/ + $href = $link->attributes->getNamedItem('href')->nodeValue; + $fullPath = str_starts_with($href, 'http') ? $href : "$baseUrl$href"; + + $linkUrl = parse_url($fullPath); + if ($linkUrl['host'] != $targetUrl['host']) { + continue; + } + + $content = file_get_contents($fullPath, false, $streamContext); + $content = strip_tags($content); + + $db->prepare(<<execute([ + 'url' => $fullPath, + 'content' => $content, + 'excerpt' => 'EXCERPT', + 'title' => 'TITLE', + 'timestamp' => time(), + ]); + } + } + + // TODO: check sitemap first, only check those links then + // TODO: otherwise get every link on site and crawl that + // TODO: check if link (without query params?) already visited for recursive protection + // strip html => store text in db +} + + + +if (isset($_GET['search'])) { + if (! empty($_GET['search'])) { + $statement = $db->prepare(<<execute(['search' => "%{$_GET['search']}%"]); + $result = $statement->fetchAll(); + + foreach ($result as $row) { + ?> +
+ +

+ +
+