From abe71674658541519389cc50cbf6ace7d5287c25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20H=C3=B6lting?= <87192362+moritz-hoelting@users.noreply.github.com>
Date: Sat, 21 Mar 2026 22:14:47 +0100
Subject: [PATCH] block ai crawlers in robots.txt

---
Reviewer notes (commentary only; this region between the three-dash line
and the first "diff --git" line is not part of the change itself):

 * astro.config.mjs: sitemap() is now called with { lastmod: new Date() },
   i.e. a Date created at the moment the config module is evaluated.
   NOTE(review): presumably this stamps every sitemap entry with the build
   time rather than a per-page modification date; confirm against the
   @astrojs/sitemap documentation that this is the intended effect.

 * src/pages/robots.txt.ts: adds the aibotUserAgents list. The template
   emits one "User-agent: <name>" line per entry, joined with newlines,
   followed by a single "Disallow: /", so all listed agents share one group.
   The existing "User-agent: *" group gains a Content-Signal line and
   "Disallow: /qualifications/" ahead of the existing "Allow: /".

 * NOTE(review): a few entries differ only by letter case
   (meta-externalagent / Meta-ExternalAgent, meta-externalfetcher /
   Meta-ExternalFetcher, Webzio-Extended / webzio-extended). These look
   redundant if crawlers match the token case-insensitively; confirm
   before pruning.

 * NOTE(review): line breaks and leading indentation in this copy were
   reconstructed from a whitespace-collapsed source. Verify the context
   lines' indentation against the target files before applying.

 astro.config.mjs        |   2 +-
 src/pages/robots.txt.ts | 174 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+), 1 deletion(-)

diff --git a/astro.config.mjs b/astro.config.mjs
index f62addf..39a3046 100644
--- a/astro.config.mjs
+++ b/astro.config.mjs
@@ -15,7 +15,7 @@ export default defineConfig({
   trailingSlash: "always",
 
   integrations: [
-    sitemap(),
+    sitemap({ lastmod: new Date() }),
     mermaid({
       autoTheme: true,
       mermaidConfig: {
diff --git a/src/pages/robots.txt.ts b/src/pages/robots.txt.ts
index a97e22a..a4990eb 100644
--- a/src/pages/robots.txt.ts
+++ b/src/pages/robots.txt.ts
@@ -1,7 +1,181 @@
 import type { APIRoute } from "astro";
 
+const aibotUserAgents = [
+  "AddSearchBot",
+  "AI2Bot",
+  "AI2Bot-DeepResearchEval",
+  "Ai2Bot-Dolma",
+  "aiHitBot",
+  "amazon-kendra",
+  "Amazonbot",
+  "AmazonBuyForMe",
+  "Amzn-SearchBot",
+  "Amzn-User",
+  "Andibot",
+  "Anomura",
+  "anthropic-ai",
+  "ApifyBot",
+  "ApifyWebsiteContentCrawler",
+  "Applebot",
+  "Applebot-Extended",
+  "atlassian-bot",
+  "Awario",
+  "AzureAI-SearchBot",
+  "bedrockbot",
+  "bigsur.ai",
+  "Bravebot",
+  "Brightbot 1.0",
+  "BuddyBot",
+  "Bytespider",
+  "CCBot",
+  "Channel3Bot",
+  "ChatGLM-Spider",
+  "ChatGPT Agent",
+  "ChatGPT-User",
+  "Claude-SearchBot",
+  "Claude-User",
+  "Claude-Web",
+  "ClaudeBot",
+  "Cloudflare-AutoRAG",
+  "CloudVertexBot",
+  "cohere-ai",
+  "cohere-training-data-crawler",
+  "Cotoyogi",
+  "Crawl4AI",
+  "Crawlspace",
+  "Datenbank Crawler",
+  "DeepSeekBot",
+  "Devin",
+  "Diffbot",
+  "DuckAssistBot",
+  "Echobot Bot",
+  "EchoboxBot",
+  "ExaBot",
+  "FacebookBot",
+  "facebookexternalhit",
+  "Factset_spyderbot",
+  "FirecrawlAgent",
+  "FriendlyCrawler",
+  "Gemini-Deep-Research",
+  "Google-CloudVertexBot",
+  "Google-Extended",
+  "Google-Firebase",
+  "Google-NotebookLM",
+  "GoogleAgent-Mariner",
+  "GoogleOther",
+  "GoogleOther-Image",
+  "GoogleOther-Video",
+  "GPTBot",
+  "iAskBot",
+  "iaskspider",
+  "iaskspider/2.0",
+  "IbouBot",
+  "ICC-Crawler",
+  "ImagesiftBot",
+  "imageSpider",
+  "img2dataset",
+  "ISSCyberRiskCrawler",
+  "kagi-fetcher",
+  "Kangaroo Bot",
+  "KlaviyoAIBot",
+  "KunatoCrawler",
+  "laion-huggingface-processor",
+  "LAIONDownloader",
+  "LCC",
+  "LinerBot",
+  "Linguee Bot",
+  "LinkupBot",
+  "Manus-User",
+  "meta-externalagent",
+  "Meta-ExternalAgent",
+  "meta-externalfetcher",
+  "Meta-ExternalFetcher",
+  "meta-webindexer",
+  "MistralAI-User",
+  "MistralAI-User/1.0",
+  "MyCentralAIScraperBot",
+  "netEstate Imprint Crawler",
+  "NotebookLM",
+  "NovaAct",
+  "OAI-SearchBot",
+  "omgili",
+  "omgilibot",
+  "OpenAI",
+  "Operator",
+  "PanguBot",
+  "Panscient",
+  "panscient.com",
+  "Perplexity-User",
+  "PerplexityBot",
+  "PetalBot",
+  "PhindBot",
+  "Poggio-Citations",
+  "Poseidon Research Crawler",
+  "QualifiedBot",
+  "QuillBot",
+  "quillbot.com",
+  "SBIntuitionsBot",
+  "Scrapy",
+  "SemrushBot-OCOB",
+  "SemrushBot-SWA",
+  "ShapBot",
+  "Sidetrade indexer bot",
+  "Spider",
+  "TavilyBot",
+  "TerraCotta",
+  "Thinkbot",
+  "TikTokSpider",
+  "Timpibot",
+  "TwinAgent",
+  "VelenPublicWebCrawler",
+  "WARDBot",
+  "Webzio-Extended",
+  "webzio-extended",
+  "wpbot",
+  "WRTNBot",
+  "YaK",
+  "YandexAdditional",
+  "YandexAdditionalBot",
+  "YouBot",
+  "ZanistaBot",
+];
+
 const robotsTxt = `
+# As a condition of accessing this website, you agree to
+# abide by the following content signals:
+
+# (a) If a content-signal = yes, you may collect content
+# for the corresponding use.
+# (b) If a content-signal = no, you may not collect content
+# for the corresponding use.
+# (c) If the website operator does not include a content
+# signal for a corresponding use, the website operator
+# neither grants nor restricts permission via content signal
+# with respect to the corresponding use.
+
+# The content signals and their meanings are:
+
+# search: building a search index and providing search
+# results (e.g., returning hyperlinks and short excerpts
+# from your website's contents). Search does not include
+# providing AI-generated search summaries.
+# ai-input: inputting content into one or more AI models
+# (e.g., retrieval augmented generation, grounding, or other
+# real-time taking of content for generative AI search
+# answers).
+# ai-train: training or fine-tuning AI models.
+
+# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS
+# RESERVATIONS OF RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN
+# UNION DIRECTIVE 2019/790 ON COPYRIGHT AND RELATED RIGHTS
+# IN THE DIGITAL SINGLE MARKET.
+
+${aibotUserAgents.map((userAgent) => `User-agent: ${userAgent}`).join("\n")}
+Disallow: /
+
 User-agent: *
+Content-Signal: ai-train=no, search=yes, ai-input=no
+Disallow: /qualifications/
 Allow: /
 
 Sitemap: ${new URL("sitemap-index.xml", import.meta.env.SITE).href}