# documentation: https://signoz.io/docs/introduction/
# slogan: An observability platform native to OpenTelemetry with logs, traces and metrics.
# tags: telemetry, server, applications, interface, logs, monitoring, traces, metrics
# logo: svgs/signoz.svg
# port: 8080
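# Stack overview (informal): ClickHouse (with ZooKeeper for replication metadata)
# stores the telemetry data, the schema migrators prepare its databases, the
# signoz service serves the UI/API on port 8080, and the OTel collector ingests
# OTLP data on 4317 (gRPC) and 4318 (HTTP).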
services:
  init-clickhouse:
    image: clickhouse/clickhouse-server:24.1.2-alpine
    container_name: signoz-init-clickhouse
    command:
      - bash
      - -c
      - |
        version="v0.0.1"
        node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
        node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
        echo "Fetching histogram-quantile binary for $${node_os}/$${node_arch}"
        cd /tmp
        wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
        tar -xvzf histogram-quantile.tar.gz
        mkdir -p /var/lib/clickhouse/user_scripts/histogramQuantile
        mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
    restart: on-failure
    logging:
      options:
        max-size: 50m
        max-file: "3"
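  # The init container above downloads SigNoz's histogram-quantile binary into
  # /var/lib/clickhouse/user_scripts; ClickHouse executes it as the
  # histogramQuantile UDF declared in custom-function.xml further below.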
  zookeeper-1:
    image: bitnami/zookeeper:3.7.1
    container_name: signoz-zookeeper-1
    user: root
    healthcheck:
      test:
        - CMD-SHELL
        - curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
      interval: 30s
      timeout: 5s
      retries: 3
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    volumes:
      - zookeeper-1:/bitnami/zookeeper
    environment:
      - ZOO_SERVER_ID=1
      - ALLOW_ANONYMOUS_LOGIN=yes
      - ZOO_AUTOPURGE_INTERVAL=1
      - ZOO_ENABLE_PROMETHEUS_METRICS=yes
      - ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
  clickhouse:
    # adding a non-LTS version due to this fix: https://github.com/ClickHouse/ClickHouse/commit/32caf8716352f45c1b617274c7508c86b7d1afab
    image: clickhouse/clickhouse-server:24.1.2-alpine
    container_name: signoz-clickhouse
    tty: true
    depends_on:
      init-clickhouse:
        condition: service_completed_successfully
      zookeeper-1:
        condition: service_healthy
    healthcheck:
      test:
        - CMD
        - wget
        - --spider
        - -q
        - 0.0.0.0:8123/ping
      interval: 30s
      timeout: 5s
      retries: 3
    ulimits:
      nproc: 65535
      nofile:
        soft: 262144
        hard: 262144
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    volumes:
      - type: bind
        source: ./clickhouse/config.d/config.xml
        target: /etc/clickhouse-server/config.d/config.xml
        content: |
          <clickhouse>
            <logger>
              <level>information</level>
              <formatting>
                <type>json</type>
              </formatting>
            </logger>
            <macros>
              <shard>01</shard>
              <replica>example01-01-1</replica>
            </macros>
            <user_defined_executable_functions_config>*function.xml</user_defined_executable_functions_config>
          </clickhouse>
      - type: bind
        source: ./clickhouse/custom-function.xml
        target: /etc/clickhouse-server/custom-function.xml
        content: |
          <functions>
            <function>
              <type>executable</type>
              <name>histogramQuantile</name>
              <return_type>Float64</return_type>
              <argument>
                <type>Array(Float64)</type>
                <name>buckets</name>
              </argument>
              <argument>
                <type>Array(Float64)</type>
                <name>counts</name>
              </argument>
              <argument>
                <type>Float64</type>
                <name>quantile</name>
              </argument>
              <format>CSV</format>
              <command>./histogramQuantile</command>
            </function>
          </functions>
      - type: bind
        source: ./clickhouse/cluster.xml
        target: /etc/clickhouse-server/config.d/cluster.xml
        content: |
          <?xml version="1.0"?>
          <clickhouse>
            <!-- ZooKeeper is used to store metadata about replicas when using Replicated tables.
                 Optional. If you don't use replicated tables, you can omit this.
                 See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
            -->
            <zookeeper>
              <node index="1">
                <host>zookeeper-1</host>
                <port>2181</port>
              </node>
              <!-- <node index="2">
                <host>zookeeper-2</host>
                <port>2181</port>
              </node>
              <node index="3">
                <host>zookeeper-3</host>
                <port>2181</port>
              </node> -->
            </zookeeper>
            <!-- Configuration of clusters that can be used in Distributed tables.
                 https://clickhouse.com/docs/en/operations/table_engines/distributed/
            -->
            <remote_servers>
              <cluster>
                <!-- Inter-server per-cluster secret for Distributed queries.
                     Default: no secret (no authentication will be performed).
                     If set, Distributed queries will be validated on the shards, so at least:
                     - such a cluster should exist on the shard,
                     - such a cluster should have the same secret.
                     More importantly, the initial_user will then be used as the current user for the query.
                     Right now the protocol is fairly simple and only takes into account:
                     - the cluster name
                     - the query
                     It would also be nice if the following were implemented:
                     - source hostname (see interserver_http_host), but then it would depend on DNS;
                       an IP address could be used instead, but then it must be correct on the initiator node.
                     - target hostname / IP address (same notes as for the source hostname)
                     - time-based security tokens
                -->
                <!-- <secret></secret> -->
                <shard>
                  <!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
                  <!-- <internal_replication>false</internal_replication> -->
                  <!-- Optional. Shard weight when writing data. Default: 1. -->
                  <!-- <weight>1</weight> -->
                  <replica>
                    <host>clickhouse</host>
                    <port>9000</port>
                    <!-- Optional. Priority of the replica for load_balancing. Default: 1 (a lower value has higher priority). -->
                    <!-- <priority>1</priority> -->
                  </replica>
                </shard>
                <!-- <shard>
                  <replica>
                    <host>clickhouse-2</host>
                    <port>9000</port>
                  </replica>
                </shard>
                <shard>
                  <replica>
                    <host>clickhouse-3</host>
                    <port>9000</port>
                  </replica>
                </shard> -->
              </cluster>
            </remote_servers>
          </clickhouse>
      - type: volume
        source: clickhouse
        target: /var/lib/clickhouse/
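  # A quick sanity check for the UDF once ClickHouse is up (a sketch; the
  # bucket/count array values are purely illustrative):
  #   docker exec signoz-clickhouse clickhouse-client -q \
  #     "SELECT histogramQuantile([0.1, 1.0, 10.0], [10.0, 50.0, 100.0], 0.95)"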
  signoz:
    image: signoz/signoz:latest
    container_name: signoz
    depends_on:
      clickhouse:
        condition: service_healthy
      schema-migrator-sync:
        condition: service_completed_successfully
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    command:
      - --config=/root/config/prometheus.yml
    volumes:
      - type: bind
        source: ./prometheus.yml
        target: /root/config/prometheus.yml
        content: |
          # my global config
          global:
            scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute.
            evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
            # scrape_timeout is set to the global default (10s).
          # Alertmanager configuration
          alerting:
            alertmanagers:
              - static_configs:
                  - targets:
                      - alertmanager:9093
          # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
          rule_files: []
          # - "first_rules.yml"
          # - "second_rules.yml"
          # - 'alerts.yml'
          # A scrape configuration containing exactly one endpoint to scrape:
          # Here it's Prometheus itself.
          scrape_configs: []
          remote_read:
            - url: tcp://clickhouse:9000/signoz_metrics
      - type: volume
        source: sqlite
        target: /var/lib/signoz/
    environment:
      - SERVICE_FQDN_SIGNOZ_8080
      - SIGNOZ_ALERTMANAGER_PROVIDER=signoz
      - SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
      - SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
      - DASHBOARDS_PATH=/root/config/dashboards
      - STORAGE=clickhouse
      - GODEBUG=netdns=go
      - DEPLOYMENT_TYPE=docker-standalone-amd
      - TELEMETRY_ENABLED=${TELEMETRY_ENABLED:-true}
      - SMTP_ENABLED=${SMTP_ENABLED:-false}
      - SMTP_FROM=${SMTP_FROM}
      - SMTP_HOST=${SMTP_HOST}
      - SMTP_PORT=${SMTP_PORT}
      - SMTP_USERNAME=${SMTP_USERNAME}
      - SMTP_PASSWORD=${SMTP_PASSWORD}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__PASSWORD=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__PASSWORD}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__USERNAME=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__USERNAME}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__FROM=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__FROM}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__SMARTHOST=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__SMARTHOST}
    healthcheck:
      test:
        - CMD
        - wget
        - --spider
        - -q
        - localhost:8080/api/v1/health
      interval: 30s
      timeout: 5s
      retries: 3
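  # The SigNoz UI and API are served on port 8080 (probed by the healthcheck
  # above); Coolify exposes it through SERVICE_FQDN_SIGNOZ_8080.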
  otel-collector:
    image: signoz/signoz-otel-collector:latest
    container_name: signoz-otel-collector
    depends_on:
      clickhouse:
        condition: service_healthy
      schema-migrator-sync:
        condition: service_completed_successfully
      signoz:
        condition: service_healthy
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    ports:
      - "4317:4317" # OTLP gRPC receiver
      - "4318:4318" # OTLP HTTP receiver
    command:
      - --config=/etc/otel-collector-config.yaml
      - --manager-config=/etc/manager-config.yaml
      - --copy-path=/var/tmp/collector-config.yaml
      - --feature-gates=-pkg.translator.prometheus.NormalizeName
    volumes:
      - type: bind
        source: ./otel-collector-config.yaml
        target: /etc/otel-collector-config.yaml
        content: |
          receivers:
            otlp:
              protocols:
                grpc:
                  endpoint: 0.0.0.0:4317
                http:
                  endpoint: 0.0.0.0:4318
            prometheus:
              config:
                global:
                  scrape_interval: 60s
                scrape_configs:
                  - job_name: otel-collector
                    static_configs:
                      - targets:
                          - localhost:8888
                        labels:
                          job_name: otel-collector
          processors:
            batch:
              send_batch_size: 10000
              send_batch_max_size: 11000
              timeout: 10s
            resourcedetection:
              # The env detector reads the OTEL_RESOURCE_ATTRIBUTES env var to add custom labels.
              detectors: [env, system]
              timeout: 2s
            signozspanmetrics/delta:
              metrics_exporter: clickhousemetricswrite
              metrics_flush_interval: 60s
              latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
              dimensions_cache_size: 100000
              aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
              enable_exp_histogram: true
              dimensions:
                - name: service.namespace
                  default: default
                - name: deployment.environment
                  default: default
                # This is added to ensure the uniqueness of the timeseries.
                # Otherwise, identical timeseries produced by multiple collector
                # replicas result in incorrect APM metrics.
                - name: signoz.collector.id
                - name: service.version
                - name: browser.platform
                - name: browser.mobile
                - name: k8s.cluster.name
                - name: k8s.node.name
                - name: k8s.namespace.name
                - name: host.name
                - name: host.type
                - name: container.name
          extensions:
            health_check:
              endpoint: 0.0.0.0:13133
            pprof:
              endpoint: 0.0.0.0:1777
          exporters:
            clickhousetraces:
              datasource: tcp://clickhouse:9000/signoz_traces
              low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
              use_new_schema: true
            clickhousemetricswrite:
              endpoint: tcp://clickhouse:9000/signoz_metrics
              resource_to_telemetry_conversion:
                enabled: true
            clickhousemetricswrite/prometheus:
              endpoint: tcp://clickhouse:9000/signoz_metrics
            signozclickhousemetrics:
              dsn: tcp://clickhouse:9000/signoz_metrics
            clickhouselogsexporter:
              dsn: tcp://clickhouse:9000/signoz_logs
              timeout: 10s
              use_new_schema: true
            # debug: {}
          service:
            telemetry:
              logs:
                encoding: json
              metrics:
                address: 0.0.0.0:8888
            extensions:
              - health_check
              - pprof
            pipelines:
              traces:
                receivers: [otlp]
                processors: [signozspanmetrics/delta, batch]
                exporters: [clickhousetraces]
              metrics:
                receivers: [otlp]
                processors: [batch]
                exporters: [clickhousemetricswrite, signozclickhousemetrics]
              metrics/prometheus:
                receivers: [prometheus]
                processors: [batch]
                exporters: [clickhousemetricswrite/prometheus, signozclickhousemetrics]
              logs:
                receivers: [otlp]
                processors: [batch]
                exporters: [clickhouselogsexporter]
      - type: bind
        source: ./otel-collector-opamp-config.yaml
        target: /etc/manager-config.yaml
        content: |
          server_endpoint: ws://signoz:4320/v1/opamp
    environment:
      - SERVICE_FQDN_OTELCOLLECTORGRPC_4317
      - SERVICE_FQDN_OTELCOLLECTORHTTP_4318
      - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
      - LOW_CARDINAL_EXCEPTION_GROUPING=false
    healthcheck:
      # Opens a raw TCP connection to the health_check extension port via bash's /dev/tcp.
      test: bash -c "exec 6<> /dev/tcp/localhost/13133"
      interval: 30s
      timeout: 5s
      retries: 3
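  # To ship telemetry from an application, point its OTLP exporter at this
  # collector using the standard OpenTelemetry SDK variables; for example
  # (host and service name are placeholders):
  #   export OTEL_EXPORTER_OTLP_ENDPOINT="http://<server-ip>:4318"
  #   export OTEL_RESOURCE_ATTRIBUTES="service.name=my-app"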
  schema-migrator-sync:
    image: signoz/signoz-schema-migrator:latest
    container_name: schema-migrator-sync
    command:
      - sync
      - --dsn=tcp://clickhouse:9000
      - --up=
    depends_on:
      clickhouse:
        condition: service_healthy
    restart: on-failure
    logging:
      options:
        max-size: 50m
        max-file: "3"
  schema-migrator-async:
    image: signoz/signoz-schema-migrator:latest
    container_name: schema-migrator-async
    depends_on:
      clickhouse:
        condition: service_healthy
      schema-migrator-sync:
        condition: service_completed_successfully
    logging:
      options:
        max-size: 50m
        max-file: "3"
    command:
      - async
      - --dsn=tcp://clickhouse:9000
      - --up=
    restart: on-failure
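  # Note: the sync migrator must finish before signoz and otel-collector start
  # (both depend on its service_completed_successfully condition); the async
  # migrator then applies the remaining long-running migrations in the
  # background. The empty --up= mirrors the upstream SigNoz compose files.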