2 files changed, 134 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9b1dffd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.sqlite
diff --git a/index.php b/index.php
new file mode 100644
index 0000000..c9bb48c
--- /dev/null
+++ b/index.php
@@ -0,0 +1,133 @@
+<?php
+
+/*
+ ****
+ * DB
+ ****
+ */
+
+// Connection settings are read from $_ENV; without overrides a local SQLite file is used.
+$dbType = $_ENV['DB_TYPE'] ?? 'sqlite';
+$dbPath = $_ENV['DB_PATH'] ?? './searcher.sqlite';
+$dbHost = $_ENV['DB_HOST'] ?? '';
+$dbPort = $_ENV['DB_PORT'] ?? '';
+$dbName = $_ENV['DB_NAME'] ?? '';
+$dbUsername = $_ENV['DB_USERNAME'] ?? '';
+$dbPassword = $_ENV['DB_PASSWORD'] ?? '';
+
+// Turn the non-empty server settings into "key=value" DSN segments.
+$dsnParts = array_filter(
+  ['host' => $dbHost, 'port' => $dbPort, 'dbname' => $dbName],
+  fn ($value) => ! empty($value),
+);
+$dsnParts = array_map(
+  fn ($key, $value) => "$key=$value",
+  array_keys($dsnParts),
+  $dsnParts,
+);
+
+// SQLite takes a file path after the driver prefix; other drivers take the joined segments.
+if ($dbType == 'sqlite') {
+  $dsn = "$dbType:$dbPath";
+} else {
+  $dsn = "$dbType:" . implode(';', $dsnParts);
+}
+// Every fetch returns associative arrays unless a call asks for another mode.
+global $db;
+$db = new PDO($dsn, $dbUsername, $dbPassword, [PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC]);
+
+if (isset($_GET['init--db'])) { // one-off schema bootstrap, requested via ?init--db
+  // exec() runs the DDL exactly once; `unique` rejects a second row for the same url.
+  $db->exec(<<<SQL
+    create table if not exists search_index (
+      id integer primary key,
+      url text unique,
+      content text,
+      excerpt text,
+      title text,
+      timestamp integer
+    );
+    SQL);
+}
+
+
+
+/*
+ ****
+ * Crawler
+ ****
+ */
+
+$targetToCrawl = $_GET['crawl'] ?? '';
+$targetUrl = is_string($targetToCrawl) ? (parse_url($targetToCrawl) ?: []) : [];
+// Only absolute http(s) targets are crawled: anything else would let the
+// file_get_contents() calls below read local files or other stream wrappers.
+if (in_array(strtolower($targetUrl['scheme'] ?? ''), ['http', 'https'], true) && ! empty($targetUrl['host'])) {
+  $baseUrl = "{$targetUrl['scheme']}://{$targetUrl['host']}" . (isset($targetUrl['port']) ? ":{$targetUrl['port']}" : '');
+  $streamContext = stream_context_create(['http' => ['follow_location' => true]]);
+  // get_headers() returns false when the request fails; that is treated as "no sitemap".
+  $sitemapHeaders = get_headers("$baseUrl/sitemap.xml");
+  $responseCode = intval(explode(' ', $sitemapHeaders[0] ?? '')[1] ?? 0);
+  if ($responseCode === 200) {
+    // TODO: check sitemap first, only check those links then
+  } else {
+    // TODO: otherwise get every link on site and crawl that (for now only this one page is scanned)
+    $html = file_get_contents($targetToCrawl, false, $streamContext);
+    $dom = new DOMDocument();
+    if (is_string($html) && $html !== '') {
+      @$dom->loadHTML($html); // an empty source makes loadHTML() throw, so failed fetches are skipped
+    }
+    $forget = $db->prepare('delete from search_index where url = :url');
+    $insert = $db->prepare(<<<SQL
+      insert into search_index (url, content, excerpt, title, timestamp) values (:url, :content, :excerpt, :title, :timestamp)
+      SQL);
+    $seen = [];
+    // `//a[@href]` leaves out anchors without an href, for which the attribute lookup yields null.
+    foreach ((new DOMXPath($dom))->query('//a[@href]') as $link) {
+      $href = $link->getAttribute('href');
+      $fullPath = str_starts_with($href, 'http') ? $href : "$baseUrl$href";
+      // Stay on the crawled host and visit each distinct URL once per run.
+      // TODO: check if link (without query params?) already visited for recursive protection
+      if (parse_url($fullPath, PHP_URL_HOST) !== $targetUrl['host'] || isset($seen[$fullPath])) {
+        continue;
+      }
+      $seen[$fullPath] = true;
+      $content = file_get_contents($fullPath, false, $streamContext);
+      if ($content === false) {
+        continue; // fetch failed: nothing to index
+      }
+      // Drop any earlier row first so a re-crawl refreshes the page instead of duplicating it.
+      $forget->execute(['url' => $fullPath]);
+      $insert->execute([
+        'url' => $fullPath,
+        'content' => strip_tags($content), // strip html => store text in db
+        'excerpt' => 'EXCERPT',
+        'title' => 'TITLE',
+        'timestamp' => time(),
+      ]);
+    }
+  }
+}
+
+
+
+if (isset($_GET['search'])) {
+  if (is_string($_GET['search']) && $_GET['search'] !== '') { // ignores ?search[]=... and the empty term
+    $statement = $db->prepare(<<<SQL
+      select * from search_index
+      where content like :search
+      SQL);
+    $statement->execute(['search' => "%{$_GET['search']}%"]);
+    // Stored values such as the url originate from crawled pages, so they are escaped before being echoed.
+    $escape = static fn ($value) => htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
+    foreach ($statement->fetchAll() as $row) {
+      ?>
+      <div>
+        <a href="<?php echo $escape($row['url']); ?>"><?php echo $escape($row['title']); ?></a>
+        <p><?php echo $escape($row['excerpt']); ?></p>
+        <a href="<?php echo $escape($row['url']); ?>"><?php echo $escape($row['url']); ?></a>
+      </div>
+      <?php
+    }
+  }
+}