author    Daniel Weipert <git@mail.dweipert.de>    2024-07-19 14:07:32 +0200
committer Daniel Weipert <git@mail.dweipert.de>    2024-07-19 14:07:32 +0200
commit    ef1e70b5d73fec98d8b3be96c9a776ff36566d2c (patch)
tree      4bf8f7db6440b1db980300e7fd233093b4114221 /index.php
initial commit
Diffstat (limited to 'index.php')
-rw-r--r--    index.php    133
1 file changed, 133 insertions, 0 deletions
diff --git a/index.php b/index.php
new file mode 100644
index 0000000..c9bb48c
--- /dev/null
+++ b/index.php
@@ -0,0 +1,133 @@
+<?php
+
+/*
+ ****
+ * DB
+ ****
+ */
+
+$dbType = $_ENV['DB_TYPE'] ?? 'sqlite';
+$dbPath = $_ENV['DB_PATH'] ?? './searcher.sqlite';
+$dbHost = $_ENV['DB_HOST'] ?? '';
+$dbPort = $_ENV['DB_PORT'] ?? '';
+$dbName = $_ENV['DB_NAME'] ?? '';
+$dbUsername = $_ENV['DB_USERNAME'] ?? '';
+$dbPassword = $_ENV['DB_PASSWORD'] ?? '';
+
+$dsnParts = [
+    'host' => $dbHost,
+    'port' => $dbPort,
+    'dbname' => $dbName,
+];
+$dsnParts = array_filter($dsnParts, fn ($value) => ! empty($value));
+$dsnParts = array_map(function ($key, $value) {
+    return "$key=$value";
+}, array_keys($dsnParts), $dsnParts);
+
+$dsn = "$dbType:" . ($dbType == 'sqlite' ? $dbPath : implode(';', $dsnParts));
+
+global $db;
+$db = new PDO(
+    $dsn,
+    $dbUsername,
+    $dbPassword,
+    [
+        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
+    ],
+);
+
+if (isset($_GET['init--db'])) {
+    $db->query(<<<SQL
+        create table if not exists search_index (
+            id integer primary key,
+            url text unique,
+            content text,
+            excerpt text,
+            title text,
+            timestamp integer
+        );
+    SQL)
+    ->execute();
+}
+
+
+
+/*
+ ****
+ * Crawler
+ ****
+ */
+
+if (! empty($_GET['crawl'])) {
+    $targetToCrawl = $_GET['crawl'];
+    $targetUrl = parse_url($targetToCrawl);
+    $baseUrl = "{$targetUrl['scheme']}://{$targetUrl['host']}";
+
+    $streamContext = stream_context_create([
+        'http' => [
+            'follow_location' => true,
+        ],
+    ]);
+
+    $responseCode = intval(explode(' ', get_headers("$baseUrl/sitemap.xml")[0])[1]);
+    if ($responseCode == 200) { // sitemap available; handling it is still TODO (see notes below)
+    } else {
+        $dom = new DOMDocument();
+        @$dom->loadHTML(file_get_contents($targetToCrawl, false, $streamContext));
+        $xpath = new DOMXPath($dom);
+
+        $links = $xpath->query('//a');
+        foreach ($links as $link) {
+            /** @var DOMElement $link */
+            $href = $link->getAttribute('href');
+            $fullPath = str_starts_with($href, 'http') ? $href : "$baseUrl$href";
+
+            $linkUrl = parse_url($fullPath);
+            if (($linkUrl['host'] ?? '') != $targetUrl['host']) {
+                continue;
+            }
+
+            $content = file_get_contents($fullPath, false, $streamContext);
+            $content = strip_tags($content);
+
+            $db->prepare(<<<SQL
+                insert into search_index (url, content, excerpt, title, timestamp) values (:url, :content, :excerpt, :title, :timestamp)
+            SQL)
+            ->execute([
+                'url' => $fullPath,
+                'content' => $content,
+                'excerpt' => 'EXCERPT',
+                'title' => 'TITLE',
+                'timestamp' => time(),
+            ]);
+        }
+    }
+
+    // TODO: check the sitemap first and only crawl those links (see the sketch after the diff)
+    // TODO: otherwise collect every link on the site and crawl that
+    // TODO: check whether a link (without query params?) was already visited, for recursion protection
+    // strip html => store text in db
+}
+
+
+
+if (isset($_GET['search'])) {
+    if (! empty($_GET['search'])) {
+        $statement = $db->prepare(<<<SQL
+            select * from search_index
+            where content like :search
+        SQL);
+        $statement->execute(['search' => "%{$_GET['search']}%"]);
+        $result = $statement->fetchAll();
+
+        foreach ($result as $row) {
+            ?>
+            <div>
+                <a href="<?php echo htmlspecialchars($row['url']); ?>"><?php echo htmlspecialchars($row['title']); ?></a>
+                <p><?php echo htmlspecialchars($row['excerpt']); ?></p>
+                <a href="<?php echo htmlspecialchars($row['url']); ?>"><?php echo htmlspecialchars($row['url']); ?></a>
+            </div>
+            <?php
+        }
+    }
+}
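
For reference, the script added here exposes three plain GET entry points; assuming it is served from the document root (for example via PHP's built-in server, php -S localhost:8000, which is an assumption and not part of the commit):

    /index.php?init--db                      creates the search_index table
    /index.php?crawl=https://example.org/    fetches same-host links and indexes their stripped text
    /index.php?search=term                   lists indexed pages whose content matches the term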
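
The TODO notes in the crawl branch describe the intended strategy: prefer the sitemap when one exists, fall back to scraping anchors otherwise, and skip links that were already indexed. Below is a minimal sketch of what the currently empty 200 branch could do, assuming a standard <urlset>/<url>/<loc> sitemap layout; the crawlPage() helper and the dedup check against the url column are illustrative assumptions, not part of this commit.

    // sketch only, not in the commit: sitemap-first crawling with a visited check
    if ($responseCode == 200) {
        $sitemap = new DOMDocument();
        @$sitemap->loadXML(file_get_contents("$baseUrl/sitemap.xml", false, $streamContext));

        foreach ($sitemap->getElementsByTagName('loc') as $loc) {
            // strip query parameters before the visited check, as the TODO suggests
            $pageUrl = strtok(trim($loc->textContent), '?');

            // skip URLs that are already in the index
            $visited = $db->prepare('select 1 from search_index where url = :url');
            $visited->execute(['url' => $pageUrl]);
            if ($visited->fetch()) {
                continue;
            }

            crawlPage($pageUrl); // hypothetical helper wrapping the fetch-and-insert code from the else branch
        }
    }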
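
The insert statement stores the placeholders 'TITLE' and 'EXCERPT' for now. One possible way to fill them, assuming the title comes from the page's <title> element and the excerpt from the start of the stripped text; the 200-character cutoff is arbitrary and mb_substr needs the mbstring extension:

    // sketch only, not in the commit: derive title and excerpt from the fetched page
    $pageDom = new DOMDocument();
    @$pageDom->loadHTML($content); // the raw HTML, i.e. before strip_tags() is applied

    $titleNodes = $pageDom->getElementsByTagName('title');
    $title = $titleNodes->length > 0 ? trim($titleNodes->item(0)->textContent) : $fullPath;

    $text = trim(preg_replace('/\s+/', ' ', strip_tags($content)));
    $excerpt = mb_substr($text, 0, 200);

These values would then replace the placeholder strings passed to execute().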