# Who's In - Platform for Every Gathering
# https://whos-in.app
# Updated: 2026-06-21

# ========================================
# Content Signals (contentsignals.org, draft-romm-aipref-contentsignals)
# ========================================
# Who's In welcomes AI visibility. Organisers want their public events
# found by AI search, cited as context, and surfaced in training data —
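# more discovery = more RSVPs. Private routes (/dashboard, /profile, etc.)
# are Disallow'd below and are not meant to be AI-accessible content. A crawler
# that has its own named group obeys only that group (RFC 9309), so a Disallow
# must appear in that group to bind that crawler.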
# The Content-Signal directive lives inside the User-agent: * group below
# (a top-level/group-less duplicate was removed 2026-06-14, P2-12 — robots.txt
# group-less directives before the first User-agent line are non-standard and
# the in-group line is the canonical signal).

# ========================================
# DEFAULT POLICY: allow everything, block private routes
# ========================================
# Fallback group: obeyed only by crawlers that have NO named group of their own
# in this file (RFC 9309: a crawler follows its most specific matching group).
# No blank lines inside the group — legacy parsers treat them as group terminators.
User-agent: *
Content-Signal: ai-train=yes, search=yes, ai-input=yes
# Explicit allows for the AI/LLM documentation files.
Allow: /llms.txt
Allow: /llms-full.txt
Allow: /ai.txt
# Private / non-content routes.
Disallow: /api/
Disallow: /__/
Disallow: /dashboard
Disallow: /profile
Disallow: /settings
Disallow: /account
Disallow: /event/*/edit
Disallow: /members
Disallow: /redeem
Disallow: /admin
Disallow: /checkin/
Disallow: /survey/
Disallow: /auth/
Disallow: /unsubscribe
Disallow: /support/
Disallow: /clubs/create
Disallow: /create
Disallow: /add-product
Disallow: /my-clubs
Disallow: /analytics
Disallow: /demo/
Disallow: /preview/
Disallow: /carousel/
# Catch-all allow goes LAST. Longest-match parsers (RFC 9309) ignore rule order,
# but first-match parsers would stop at a leading "Allow: /" and never reach
# the Disallow lines above.
Allow: /

# ========================================
# SEARCH ENGINE CRAWLERS
# ========================================
# A named group REPLACES the "User-agent: *" group for the crawlers it names
# (RFC 9309 §2.2.1: only the most specific matching group is obeyed). Nothing is
# inherited, so the private-route Disallows and the Content-Signal are restated
# in each group here. Several User-agent lines may share one group.
# No blank lines inside a group — legacy parsers treat them as group terminators.

# Googlebot + Bingbot: private routes blocked, plus /og/.
# /og/ = machine-only social-preview PNG endpoints (one per event/page). No search
# value; they were flooding "Crawled - currently not indexed". Block search crawl here
# (search bots read only their own group, so social scrapers under User-agent:* keep
# fetching /og/ for share previews). Also X-Robots-Tag:noindex via firebase.json. (2026-06-21)
User-agent: Googlebot
User-agent: Bingbot
Content-Signal: ai-train=yes, search=yes, ai-input=yes
Disallow: /og/
Disallow: /api/
Disallow: /__/
Disallow: /dashboard
Disallow: /profile
Disallow: /settings
Disallow: /account
Disallow: /event/*/edit
Disallow: /members
Disallow: /redeem
Disallow: /admin
Disallow: /checkin/
Disallow: /survey/
Disallow: /auth/
Disallow: /unsubscribe
Disallow: /support/
Disallow: /clubs/create
Disallow: /create
Disallow: /add-product
Disallow: /my-clubs
Disallow: /analytics
Disallow: /demo/
Disallow: /preview/
Disallow: /carousel/
# Catch-all allow last, so first-match parsers still reach the Disallows above.
Allow: /

# Remaining search crawlers: same private-route policy; /og/ stays fetchable,
# as it was before.
User-agent: Applebot
User-agent: DuckDuckBot
User-agent: DuckAssistBot
User-agent: Slurp
User-agent: Yandex
User-agent: Baiduspider
Content-Signal: ai-train=yes, search=yes, ai-input=yes
Disallow: /api/
Disallow: /__/
Disallow: /dashboard
Disallow: /profile
Disallow: /settings
Disallow: /account
Disallow: /event/*/edit
Disallow: /members
Disallow: /redeem
Disallow: /admin
Disallow: /checkin/
Disallow: /survey/
Disallow: /auth/
Disallow: /unsubscribe
Disallow: /support/
Disallow: /clubs/create
Disallow: /create
Disallow: /add-product
Disallow: /my-clubs
Disallow: /analytics
Disallow: /demo/
Disallow: /preview/
Disallow: /carousel/
Allow: /

# ========================================
# AI/LLM CRAWLERS — ALL WELCOME
# ========================================

# One shared group for every welcomed AI/LLM crawler.
# A named group REPLACES the "User-agent: *" group for the crawlers it names
# (RFC 9309 §2.2.1), so the private-route Disallows and the Content-Signal are
# restated here; with a bare "Allow: /" these crawlers would be free to fetch
# /dashboard, /api/, etc.
# User-agent matching is case-insensitive, so case-only duplicates are not listed.
# No blank lines inside the group — legacy parsers treat them as group terminators.
#
# OpenAI
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
# Anthropic (Claude)
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: Claude-User
User-agent: Claude-SearchBot
User-agent: anthropic-ai
# Google AI (Gemini)
User-agent: Google-Extended
User-agent: Googlebot-AI
User-agent: GoogleOther
User-agent: GoogleOther-Image
User-agent: GoogleOther-Video
User-agent: google-vertex-ai
User-agent: GoogleAgent-Mariner
User-agent: Google-CloudVertexBot
# DeepSeek
User-agent: DeepSeekBot
User-agent: deepseek-ai
# Qwen (Alibaba)
User-agent: QwenBot
User-agent: Qwen-SearchBot
# Apple AI (Apple Intelligence / Siri)
User-agent: Applebot-Extended
# Microsoft/Bing AI: crawls as Bingbot, which is governed by its own group in
# the SEARCH ENGINE CRAWLERS section (not repeated here, so that its /og/ rule
# is not diluted by a second, weaker group for the same token).
# Perplexity (full UA: PerplexityBot/1.0 — only the product token
# "PerplexityBot" is a valid User-agent value; the version is not part of it)
User-agent: PerplexityBot
User-agent: Perplexity-User
# You.com
User-agent: YouBot
# Cohere
User-agent: Cohere-Crawler
User-agent: cohere-ai
# Cohere (training-specific crawler)
User-agent: cohere-training-data-crawler
# Meta/Facebook AI
User-agent: Meta-ExternalAgent
User-agent: Meta-WebIndexer
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
# Amazon
User-agent: Amazonbot
# Common Crawl (used for AI training)
User-agent: CCBot
# Allen Institute for AI (Ai2) — AI2Bot and its Dolma dataset variant
User-agent: AI2Bot
User-agent: AI2Bot-Dolma
# Hugging Face
User-agent: HuggingFaceBot
# Mistral AI
User-agent: MistralBot
User-agent: MistralAI-User
# xAI (Grok)
User-agent: Grokbot
User-agent: xAI-Bot
# Brave Search (powers Claude web search — no special UA, uses Googlebot-accessible pages)
# BraveBot is Brave's optional crawler; Brave Search primarily re-indexes Google-accessible content
User-agent: BraveBot
# Neeva (AI search)
User-agent: NeevaBot
# Phind (developer AI)
User-agent: PhindBot
# ByteDance / TikTok
User-agent: Bytespider
User-agent: TikTokSpider
# Diffbot (structured data extraction)
User-agent: Diffbot
# Firecrawl (web-to-markdown for LLMs)
User-agent: FirecrawlAgent
# Huawei (Petal Search crawler)
User-agent: Petalbot
# Huawei (PanGu model crawler)
User-agent: PanguBot
# Omgili / Webz.io (data intelligence)
User-agent: Omgili
User-agent: Omgilibot
# Seekr (AI trust & safety)
User-agent: Seekr
# ICC-Crawler (internet content crawler)
User-agent: ICC-Crawler
# Timpi (decentralized search)
User-agent: Timpibot
# ImagesiftBot (visual AI)
User-agent: ImagesiftBot
# VelenPublicWebCrawler
User-agent: VelenPublicWebCrawler
# Peer39 (contextual intelligence)
User-agent: peer39_crawler
# aiHitBot (AI indexing)
User-agent: aiHitBot
# Meltwater (media intelligence)
User-agent: Meltwater
# SEO data crawlers (allow — they drive competitor comparison visibility)
User-agent: SemrushBot-OCOB
User-agent: DataForSeoBot
# img2dataset (image training — allow for brand visibility)
User-agent: img2dataset
# ---- shared rules for every crawler listed above ----
Content-Signal: ai-train=yes, search=yes, ai-input=yes
Allow: /llms.txt
Allow: /llms-full.txt
Allow: /ai.txt
Disallow: /api/
Disallow: /__/
Disallow: /dashboard
Disallow: /profile
Disallow: /settings
Disallow: /account
Disallow: /event/*/edit
Disallow: /members
Disallow: /redeem
Disallow: /admin
Disallow: /checkin/
Disallow: /survey/
Disallow: /auth/
Disallow: /unsubscribe
Disallow: /support/
Disallow: /clubs/create
Disallow: /create
Disallow: /add-product
Disallow: /my-clubs
Disallow: /analytics
Disallow: /demo/
Disallow: /preview/
Disallow: /carousel/
# Catch-all allow last, so first-match parsers still reach the Disallows above.
Allow: /

# ========================================
# COMMERCIAL SEO SCRAPERS — BLOCKED
# ========================================
# These crawl every URL of every site for backlink / competitor analysis
# subscription products. We don't subscribe to any of them, so we get zero
# value while paying egress on every crawl. Audit 2026-05-05 showed
# Ahrefs+Semrush alone = 12.3% of bot traffic, ~$110/mo of the
# Hosting Transfer bill. Block at the robots layer (well-behaved scrapers
# respect this) and at the prerender layer (blocked-list returns 403 for
# misbehaving variants).

# All blocked scrapers share a single group: several User-agent lines followed
# by one rule set apply that rule set to every listed crawler.
# No blank lines inside the group — legacy parsers treat them as group terminators.
#
# Ahrefs
User-agent: AhrefsBot
User-agent: AhrefsSiteAudit
# Semrush
User-agent: SemrushBot
User-agent: SemrushBot-SA
User-agent: SemrushBot-BA
User-agent: SemrushBot-SI
User-agent: SemrushBot-SWA
User-agent: SemrushBot-CT
User-agent: SEMrushBot-COUB
# SE Ranking
User-agent: SERankingBot
User-agent: SERankingBacklinksBot
# Awario
User-agent: AwarioBot
User-agent: AwarioSmartBot
User-agent: AwarioRssBot
# Others
User-agent: DotBot
User-agent: MJ12bot
User-agent: BLEXBot
User-agent: SeznamBot
User-agent: linkfluence-bot
Disallow: /

# NOTE: PetalBot (Huawei's Petal Search crawler) is intentionally ALLOWED in the
# "AI/LLM CRAWLERS — ALL WELCOME" section above (matched case-insensitively
# as "Petalbot"). The duplicate "User-agent: PetalBot / Disallow: /" block
# that previously lived here was removed 2026-06-14 (P0-11) — keeping both
# was a self-contradiction since robots.txt UA matching is case-insensitive.

# ========================================
# SOCIAL MEDIA CRAWLERS
# ========================================

# Link-preview fetchers, listed as one shared group with full access.
# NOTE(review): a named group replaces "User-agent: *", so the private-route
# Disallows from the default group do NOT apply to these agents. Left unchanged
# here; confirm that unrestricted access for share previews is intended.
User-agent: Twitterbot
User-agent: LinkedInBot
User-agent: WhatsApp
User-agent: TelegramBot
User-agent: Slackbot
User-agent: Discordbot
Allow: /

# ========================================
# RESOURCES
# ========================================

# Sitemaps (sitemap.xml is a sitemap index that references all sub-sitemaps).
# The Sitemap line is not part of any User-agent group; it applies to every crawler.
Sitemap: https://whos-in.app/sitemap.xml

# AI/LLM documentation
# AI-Info: https://whos-in.app/llms.txt
# AI-Info: https://whos-in.app/ai.txt

# ========================================
# AI PERMISSIONS STATEMENT
# ========================================
#
# Who's In explicitly allows AI systems to:
# - Crawl and index public pages
# - Use content for AI training
# - Reference in AI-generated responses
# - Include in knowledge bases
#
# For detailed product docs optimized for LLMs:
# https://whos-in.app/llms.txt (overview + routing guide)
# https://whos-in.app/llms-full.txt (full content)
# https://whos-in.app/ai.txt (permissions + grounding policy)
# https://whos-in.app/openapi.yaml (OpenAPI 3.1 spec)
#
# Citation preferred: https://whos-in.app
# Contact: https://whos-in.app/contact
#
# Perplexity: Full content access with citation — all comparison tables,
#   benchmarks, and structured data are static HTML (no JS-gated content).
# Claude/Brave: All pages are Googlebot-accessible = Brave-indexable.
#   Privacy-first: GA4 analytics denied by default until user consent.
# Grok/xAI: Full access. Follow @realwhosin on X for real-time updates.
# ========================================