diff options
author | Daniel Weipert <git@mail.dweipert.de> | 2024-07-19 14:07:32 +0200 |
---|---|---|
committer | Daniel Weipert <git@mail.dweipert.de> | 2024-07-19 14:07:32 +0200 |
commit | ef1e70b5d73fec98d8b3be96c9a776ff36566d2c (patch) | |
tree | 4bf8f7db6440b1db980300e7fd233093b4114221 |
initial commit
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | index.php | 133 |
2 files changed, 134 insertions, 0 deletions
<?php

declare(strict_types=1);

/*
 ****
 * DB
 ****
 */

// Connection settings come from the environment; sqlite needs only a file path.
$dbType = $_ENV['DB_TYPE'] ?? 'sqlite';
$dbPath = $_ENV['DB_PATH'] ?? './searcher.sqlite';
$dbHost = $_ENV['DB_HOST'] ?? '';
$dbPort = $_ENV['DB_PORT'] ?? '';
$dbName = $_ENV['DB_NAME'] ?? '';
$dbUsername = $_ENV['DB_USERNAME'] ?? '';
$dbPassword = $_ENV['DB_PASSWORD'] ?? '';

// Build "key=value" DSN parts, dropping settings that were not provided.
$dsnParts = array_filter(
    [
        'host' => $dbHost,
        'port' => $dbPort,
        'dbname' => $dbName,
    ],
    static fn (string $value): bool => $value !== '',
);
$dsnParts = array_map(
    static fn (string $key, string $value): string => "$key=$value",
    array_keys($dsnParts),
    $dsnParts,
);

// sqlite DSNs carry only the file path; every other driver takes host/port/dbname.
$dsn = "$dbType:" . ($dbType === 'sqlite' ? $dbPath : implode(';', $dsnParts));

$db = new PDO(
    $dsn,
    $dbUsername,
    $dbPassword,
    [
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
        // Fail loudly instead of silently returning false from query()/prepare().
        PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
    ],
);

// One-time schema setup: /?init--db
if (isset($_GET['init--db'])) {
    // FIX: the original spelled "unqiue" — SQLite silently treated "text unqiue"
    // as the column *type*, so the intended UNIQUE constraint on url never applied.
    // exec() is the right call for DDL (query()->execute() re-ran a SELECT-style cycle).
    $db->exec(<<<SQL
        create table if not exists search_index (
            id integer primary key,
            url text unique,
            content text,
            excerpt text,
            title text,
            timestamp integer
        );
        SQL);
}

/**
 * Escapes a value for safe interpolation into HTML output (XSS protection).
 */
function e(string $value): string
{
    return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/*
 ****
 * Crawler
 ****
 */

// /?crawl=<url> — fetch the page, follow same-host links, index their stripped text.
// NOTE(review): this fetches arbitrary user-supplied URLs (SSRF surface) — fine for a
// personal tool, but restrict/allow-list hosts before exposing it publicly.
if (! empty($_GET['crawl'])) {
    $targetToCrawl = $_GET['crawl'];
    $targetUrl = parse_url($targetToCrawl);
    if ($targetUrl === false || ! isset($targetUrl['scheme'], $targetUrl['host'])) {
        http_response_code(400);
        exit('Invalid crawl URL');
    }
    $baseUrl = "{$targetUrl['scheme']}://{$targetUrl['host']}";

    $streamContext = stream_context_create([
        'http' => [
            'follow_location' => true,
        ],
    ]);

    // Probe for a sitemap first; sitemap-driven crawling is still unimplemented,
    // so for now we always fall through to scraping <a> tags off the start page.
    $responseCode = intval(explode(' ', get_headers("$baseUrl/sitemap.xml")[0])[1]);
    if ($responseCode == 200) {
        // TODO: parse sitemap.xml and crawl only the URLs it lists.
    } else {
        $dom = new DOMDocument();
        // @ silences libxml warnings on real-world (rarely valid) HTML.
        @$dom->loadHTML(file_get_contents($targetToCrawl, false, $streamContext));
        $xpath = new DOMXPath($dom);

        foreach ($xpath->query('//a') as $link) {
            /* @var DOMNode $link */
            // FIX: guard against <a> elements without an href (named anchors) —
            // the original dereferenced getNamedItem() unconditionally and fataled.
            $hrefNode = $link->attributes->getNamedItem('href');
            if ($hrefNode === null) {
                continue;
            }
            $href = $hrefNode->nodeValue;
            $fullPath = str_starts_with($href, 'http') ? $href : "$baseUrl$href";

            // Only follow links that stay on the crawled host. `?? null` avoids an
            // undefined-index warning for mailto:/fragment links with no host part.
            $linkUrl = parse_url($fullPath);
            if (($linkUrl['host'] ?? null) !== $targetUrl['host']) {
                continue;
            }

            $content = strip_tags(file_get_contents($fullPath, false, $streamContext));

            $db->prepare(<<<SQL
                insert into search_index (url, content, excerpt, title, timestamp)
                values (:url, :content, :excerpt, :title, :timestamp)
                SQL)
                ->execute([
                    'url' => $fullPath,
                    'content' => $content,
                    'excerpt' => 'EXCERPT',
                    'title' => 'TITLE',
                    'timestamp' => time(),
                ]);
        }
    }

    // TODO: check sitemap first, only check those links then
    // TODO: otherwise get every link on site and crawl that
    // TODO: check if link (without query params?) already visited for recursive protection
    // strip html => store text in db
}

/*
 ****
 * Search
 ****
 */

// /?search=<term> — LIKE-match the indexed page text and render the hits.
// (The nested isset()+!empty() of the original collapses to a single !empty().)
if (! empty($_GET['search'])) {
    // The term is parameterised (no SQL injection); note LIKE wildcards %/_ in the
    // user input are intentionally left live — TODO: escape if that becomes a problem.
    $statement = $db->prepare(<<<SQL
        select * from search_index
        where content like :search
        SQL);
    $statement->execute(['search' => "%{$_GET['search']}%"]);

    foreach ($statement->fetchAll() as $row) {
        // FIX: all DB-sourced values are escaped via e() — the original echoed
        // crawled titles/excerpts/urls raw, a stored-XSS vector.
        ?>
        <div>
            <a href="<?= e($row['url']); ?>"><?= e($row['title']); ?></a>
            <p><?= e($row['excerpt']); ?></p>
            <a href="<?= e($row['url']); ?>"><?= e($row['url']); ?></a>
        </div>
        <?php
    }
}