From abe71674658541519389cc50cbf6ace7d5287c25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20H=C3=B6lting?=
 <87192362+moritz-hoelting@users.noreply.github.com>
Date: Sat, 21 Mar 2026 22:14:47 +0100
Subject: [PATCH] block AI crawlers in robots.txt and set sitemap lastmod

---
 astro.config.mjs        |   2 +-
 src/pages/robots.txt.ts | 174 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+), 1 deletion(-)

diff --git a/astro.config.mjs b/astro.config.mjs
index f62addf..39a3046 100644
--- a/astro.config.mjs
+++ b/astro.config.mjs
@@ -15,7 +15,7 @@ export default defineConfig({
     trailingSlash: "always",
 
     integrations: [
-        sitemap(),
+        sitemap({ lastmod: new Date() }),
         mermaid({
             autoTheme: true,
             mermaidConfig: {
diff --git a/src/pages/robots.txt.ts b/src/pages/robots.txt.ts
index a97e22a..a4990eb 100644
--- a/src/pages/robots.txt.ts
+++ b/src/pages/robots.txt.ts
@@ -1,7 +1,181 @@
 import type { APIRoute } from "astro";
 
+const aibotUserAgents = [ // Crawler user-agent tokens; the robotsTxt template below emits one "User-agent:" line per entry, all sharing a single "Disallow: /" rule.
+    "AddSearchBot",
+    "AI2Bot",
+    "AI2Bot-DeepResearchEval",
+    "Ai2Bot-Dolma",
+    "aiHitBot",
+    "amazon-kendra",
+    "Amazonbot",
+    "AmazonBuyForMe",
+    "Amzn-SearchBot",
+    "Amzn-User",
+    "Andibot",
+    "Anomura",
+    "anthropic-ai",
+    "ApifyBot",
+    "ApifyWebsiteContentCrawler",
+    "Applebot",
+    "Applebot-Extended",
+    "atlassian-bot",
+    "Awario",
+    "AzureAI-SearchBot",
+    "bedrockbot",
+    "bigsur.ai",
+    "Bravebot",
+    "Brightbot 1.0",
+    "BuddyBot",
+    "Bytespider",
+    "CCBot",
+    "Channel3Bot",
+    "ChatGLM-Spider",
+    "ChatGPT Agent",
+    "ChatGPT-User",
+    "Claude-SearchBot",
+    "Claude-User",
+    "Claude-Web",
+    "ClaudeBot",
+    "Cloudflare-AutoRAG",
+    "CloudVertexBot",
+    "cohere-ai",
+    "cohere-training-data-crawler",
+    "Cotoyogi",
+    "Crawl4AI",
+    "Crawlspace",
+    "Datenbank Crawler",
+    "DeepSeekBot",
+    "Devin",
+    "Diffbot",
+    "DuckAssistBot",
+    "Echobot Bot",
+    "EchoboxBot",
+    "ExaBot",
+    "FacebookBot",
+    "facebookexternalhit",
+    "Factset_spyderbot",
+    "FirecrawlAgent",
+    "FriendlyCrawler",
+    "Gemini-Deep-Research",
+    "Google-CloudVertexBot",
+    "Google-Extended",
+    "Google-Firebase",
+    "Google-NotebookLM",
+    "GoogleAgent-Mariner",
+    "GoogleOther",
+    "GoogleOther-Image",
+    "GoogleOther-Video",
+    "GPTBot",
+    "iAskBot",
+    "iaskspider",
+    "iaskspider/2.0",
+    "IbouBot",
+    "ICC-Crawler",
+    "ImagesiftBot",
+    "imageSpider",
+    "img2dataset",
+    "ISSCyberRiskCrawler",
+    "kagi-fetcher",
+    "Kangaroo Bot",
+    "KlaviyoAIBot",
+    "KunatoCrawler",
+    "laion-huggingface-processor",
+    "LAIONDownloader",
+    "LCC",
+    "LinerBot",
+    "Linguee Bot",
+    "LinkupBot",
+    "Manus-User",
+    "meta-externalagent",
+    "Meta-ExternalAgent", // NOTE(review): differs from the previous entry only in letter case; possibly redundant if matching is case-insensitive — confirm before removing
+    "meta-externalfetcher",
+    "Meta-ExternalFetcher", // NOTE(review): differs from the previous entry only in letter case — see note on "Meta-ExternalAgent"
+    "meta-webindexer",
+    "MistralAI-User",
+    "MistralAI-User/1.0",
+    "MyCentralAIScraperBot",
+    "netEstate Imprint Crawler",
+    "NotebookLM",
+    "NovaAct",
+    "OAI-SearchBot",
+    "omgili",
+    "omgilibot",
+    "OpenAI",
+    "Operator",
+    "PanguBot",
+    "Panscient",
+    "panscient.com",
+    "Perplexity-User",
+    "PerplexityBot",
+    "PetalBot",
+    "PhindBot",
+    "Poggio-Citations",
+    "Poseidon Research Crawler",
+    "QualifiedBot",
+    "QuillBot",
+    "quillbot.com",
+    "SBIntuitionsBot",
+    "Scrapy",
+    "SemrushBot-OCOB",
+    "SemrushBot-SWA",
+    "ShapBot",
+    "Sidetrade indexer bot",
+    "Spider",
+    "TavilyBot",
+    "TerraCotta",
+    "Thinkbot",
+    "TikTokSpider",
+    "Timpibot",
+    "TwinAgent",
+    "VelenPublicWebCrawler",
+    "WARDBot",
+    "Webzio-Extended",
+    "webzio-extended", // NOTE(review): differs from the previous entry only in letter case — see note on "Meta-ExternalAgent"
+    "wpbot",
+    "WRTNBot",
+    "YaK",
+    "YandexAdditional",
+    "YandexAdditionalBot",
+    "YouBot",
+    "ZanistaBot",
+];
+
 const robotsTxt = `
+# As a condition of accessing this website, you agree to
+# abide by the following content signals:
+
+# (a)  If a content-signal = yes, you may collect content
+# for the corresponding use.
+# (b)  If a content-signal = no, you may not collect content
+# for the corresponding use.
+# (c)  If the website operator does not include a content
+# signal for a corresponding use, the website operator
+# neither grants nor restricts permission via content signal
+# with respect to the corresponding use.
+
+# The content signals and their meanings are:
+
+# search: building a search index and providing search
+# results (e.g., returning hyperlinks and short excerpts
+# from your website's contents).  Search does not include
+# providing AI-generated search summaries.
+# ai-input: inputting content into one or more AI models
+# (e.g., retrieval augmented generation, grounding, or other
+# real-time taking of content for generative AI search
+# answers).
+# ai-train: training or fine-tuning AI models.
+
+# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS
+# RESERVATIONS OF RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN
+# UNION DIRECTIVE 2019/790 ON COPYRIGHT AND RELATED RIGHTS
+# IN THE DIGITAL SINGLE MARKET.
+
+${aibotUserAgents.map((userAgent) => `User-agent: ${userAgent}`).join("\n")}
+Disallow: /
+
 User-agent: *
+Content-Signal: ai-train=no, search=yes, ai-input=no
+Disallow: /qualifications/
 Allow: /
 
 Sitemap: ${new URL("sitemap-index.xml", import.meta.env.SITE).href}