# ============================================================
# KNOWLEDGE BASE SPEC — Product Catalog
# ============================================================
# A KnowledgeBase defines a data source for Retrieval-Augmented
# Generation (RAG). It declares where the data lives, how it gets
# ingested and embedded, and how agents retrieve from it.
#
# Agents bind to a KnowledgeBase by referencing its name in their
# knowledgeBases[] field. The KB spec controls everything about
# how that retrieval works — the agent just queries it.
# ============================================================

# Schema identifier and resource kind for this manifest.
# NOTE(review): confirm which apiVersion values the platform accepts for
# KnowledgeBase resources.
apiVersion: agents.platform.io/v1
kind: KnowledgeBase

# ── Identity & Ownership ──
# `name` is the identifier agents reference from their knowledgeBases[] field.
metadata:
  name: product-catalog-kb
  displayName: "Product Catalog Knowledge Base"
  team: commerce-platform
  # Quoted so the value stays a string whatever the number of dot-separated
  # parts: an unquoted two-part version such as 1.10 would load as float 1.1.
  version: "1.5.0"
  labels:
    domain: retail
    data-class: internal

# NOTE(review): `context` sits at the top level, outside metadata and spec —
# confirm the KnowledgeBase schema expects it here.
context:
  environment: production

spec:
  # ── Type ──
  # The kind of retrieval backend. Options:
  #   vector   — embeddings + similarity search (most common)
  #   graph    — knowledge graph with entity relationships
  #   hybrid   — vector + keyword search combined
  # NOTE(review): rag.retrieval.searchType also lists a `hybrid` option;
  # presumably that needs `type: hybrid` here — confirm against the schema.
  type: vector

  # ── Data Connection ──
  # How to reach the backend that serves this knowledge base.
  # NOTE(review): judging by the database name and the backend options, this
  # is the vector store rather than the raw source data (sources are declared
  # under rag.ingestion.sources) — confirm.
  connection:
    backend: postgres          # postgres | cosmosdb | pinecone | weaviate | qdrant
    # Backend-specific connection settings.
    config:
      host: vectordb.internal
      port: 5432
      database: product_embeddings
      schema: public
    # Credentials are referenced by name rather than embedded in this file.
    auth:
      secretRef: vectordb-credentials
    ssl:
      enabled: true

  # ── RAG Configuration ──
  # End-to-end retrieval pipeline: ingestion, then embedding, then retrieval.
  rag:
    # ── Ingestion ──
    # Where source documents come from and how they are prepared for embedding.
    ingestion:
      sources:
        # Database source, refreshed on the schedule below.
        - type: database
          connection: product-db-prod
          # Folded scalar: the lines join with single spaces into one string.
          query: >-
            SELECT sku, name, description, category, price
            FROM products
            WHERE active = true
          # nightly refresh at 2 AM
          schedule: '0 2 * * *'

        # S3 source for catalog files, refreshed on the schedule below.
        - type: s3
          bucket: product-documents
          prefix: 'catalogs/'
          fileTypes:
            - pdf
            - csv
          # weekly on Sundays
          schedule: '0 3 * * 0'

      # Splitting of documents into chunks ahead of embedding.
      # strategy options: recursive | fixed | semantic | sentence
      chunking:
        strategy: recursive
        config:
          chunkSize: 512
          chunkOverlap: 64
          separators:
            - "\n\n"
            - "\n"
            - ". "

      # Content preprocessing before chunking.
      preprocessing:
        # NOTE(review): the database query above selects `price`, not
        # `price_range` — presumably price_range is derived during
        # extraction; confirm.
        - type: metadata_extraction
          fields:
            - sku
            - category
            - price_range
        - type: html_strip
        - type: normalize_whitespace

    # ── Embedding ──
    # The model used to convert text chunks into vectors.
    embedding:
      model: text-embedding-3-small
      provider: openai
      # NOTE(review): presumably must equal the vector width of the index in
      # the connection backend — confirm before changing model or dimensions.
      dimensions: 1536
      # NOTE(review): presumably the number of chunks per embedding request —
      # confirm against the ingestion worker.
      batchSize: 100
      gatewayRef: llm-gateway-prod   # route through the LLM gateway

    # ── Retrieval ──
    # Runtime query behaviour for agents bound to this knowledge base.
    retrieval:
      # REQUIRED — number of results to return
      topK: 10
      # minimum similarity score
      scoreThreshold: 0.72
      # one of: similarity | mmr | hybrid
      searchType: mmr

      # Optional: metadata filtering at query time.
      filters:
        enabled: true
        allowedFields:
          - category
          - price_range
          - sku

      # Optional: reranking for better relevance.
      reranking:
        enabled: true
        model: cross-encoder/ms-marco-MiniLM-L-6-v2
        # rerank topK down to topN
        topN: 5

      # Optional: query transformation before retrieval.
      queryTransformation:
        enabled: true
        strategies:
          # hypothetical document embedding
          - type: hyde
          # generate multiple search queries
          - type: multi_query

  # ── Security & Compliance ──
  # Data governance controls. Required at the Gold conformance profile
  # for any KB handling sensitive data.
  security:
    # allowedTeams lists the teams granted access under the rbac model.
    accessControl:
      type: rbac
      allowedTeams:
        - commerce-platform
        - customer-support
    compliance:
      dataRetention: 365d
      # NOTE(review): a plain string, unlike the `enabled: true|false` booleans
      # used by other toggles in this file — confirm the schema expects it.
      auditLog: enabled
    redaction:
      enabled: false               # set to true for PII/PHI data
      rules: []