# documentation: https://signoz.io/docs/introduction/
# slogan: An observability platform native to OpenTelemetry with logs, traces and metrics.
# tags: telemetry, server, applications, interface, logs, monitoring, traces, metrics
# logo: svgs/signoz.svg
# port: 8080
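# Stack overview (informal): ClickHouse (with ZooKeeper for replication metadata)
# stores the telemetry data, the schema migrators prepare its databases, the
# signoz service serves the UI/API on port 8080, and the OTel collector ingests
# OTLP data on 4317 (gRPC) and 4318 (HTTP).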
services:
  init-clickhouse:
    image: clickhouse/clickhouse-server:24.1.2-alpine
    container_name: signoz-init-clickhouse
    command:
      - bash
      - -c
      - |
        version="v0.0.1"
        node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
        node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
        echo "Fetching histogram-quantile binary for $${node_os}/$${node_arch}"
        cd /tmp
        wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
        tar -xvzf histogram-quantile.tar.gz
        mkdir -p /var/lib/clickhouse/user_scripts/histogramQuantile
        mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
    restart: on-failure
    logging:
      options:
        max-size: 50m
        max-file: "3"
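  # The init container above downloads SigNoz's histogram-quantile binary into
  # /var/lib/clickhouse/user_scripts; ClickHouse executes it as the
  # histogramQuantile UDF declared in custom-function.xml further below.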
  zookeeper-1:
    image: bitnami/zookeeper:3.7.1
    container_name: signoz-zookeeper-1
    user: root
    healthcheck:
      test:
        - CMD-SHELL
        - curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
      interval: 30s
      timeout: 5s
      retries: 3
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    volumes:
      - zookeeper-1:/bitnami/zookeeper
    environment:
      - ZOO_SERVER_ID=1
      - ALLOW_ANONYMOUS_LOGIN=yes
      - ZOO_AUTOPURGE_INTERVAL=1
      - ZOO_ENABLE_PROMETHEUS_METRICS=yes
      - ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
  clickhouse:
    # adding a non-LTS version due to this fix: https://github.com/ClickHouse/ClickHouse/commit/32caf8716352f45c1b617274c7508c86b7d1afab
    image: clickhouse/clickhouse-server:24.1.2-alpine
    container_name: signoz-clickhouse
    tty: true
    depends_on:
      init-clickhouse:
        condition: service_completed_successfully
      zookeeper-1:
        condition: service_healthy
    healthcheck:
      test:
        - CMD
        - wget
        - --spider
        - -q
        - 0.0.0.0:8123/ping
      interval: 30s
      timeout: 5s
      retries: 3
    ulimits:
      nproc: 65535
      nofile:
        soft: 262144
        hard: 262144
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    volumes:
      - type: bind
        source: ./clickhouse/config.d/config.xml
        target: /etc/clickhouse-server/config.d/config.xml
        content: |
          <clickhouse>
            <logger>
              <level>information</level>
              <formatting>
                <type>json</type>
              </formatting>
            </logger>
            <macros>
              <shard>01</shard>
              <replica>example01-01-1</replica>
            </macros>
            <user_defined_executable_functions_config>*function.xml</user_defined_executable_functions_config>
          </clickhouse>
      - type: bind
        source: ./clickhouse/custom-function.xml
        target: /etc/clickhouse-server/custom-function.xml
        content: |
          <functions>
            <function>
              <type>executable</type>
              <name>histogramQuantile</name>
              <return_type>Float64</return_type>
              <argument>
                <type>Array(Float64)</type>
                <name>buckets</name>
              </argument>
              <argument>
                <type>Array(Float64)</type>
                <name>counts</name>
              </argument>
              <argument>
                <type>Float64</type>
                <name>quantile</name>
              </argument>
              <format>CSV</format>
              <command>./histogramQuantile</command>
            </function>
          </functions>
      - type: bind
        source: ./clickhouse/cluster.xml
        target: /etc/clickhouse-server/config.d/cluster.xml
        content: |
          <?xml version="1.0"?>
          <clickhouse>
            <!-- ZooKeeper is used to store metadata about replicas when using Replicated tables.
                 Optional. If you don't use replicated tables, you can omit this.
                 See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
            -->
            <zookeeper>
              <node index="1">
                <host>zookeeper-1</host>
                <port>2181</port>
              </node>
              <!-- <node index="2">
                <host>zookeeper-2</host>
                <port>2181</port>
              </node>
              <node index="3">
                <host>zookeeper-3</host>
                <port>2181</port>
              </node> -->
            </zookeeper>
            <!-- Configuration of clusters that can be used in Distributed tables.
                 https://clickhouse.com/docs/en/operations/table_engines/distributed/
            -->
            <remote_servers>
              <cluster>
                <!-- Inter-server per-cluster secret for Distributed queries.
                     Default: no secret (no authentication will be performed).
                     If set, Distributed queries will be validated on the shards, so at least:
                     - such a cluster should exist on the shard,
                     - such a cluster should have the same secret.
                     More importantly, the initial_user will then be used as the current user for the query.
                     Right now the protocol is fairly simple and only takes into account:
                     - the cluster name
                     - the query
                     It would also be nice if the following were implemented:
                     - source hostname (see interserver_http_host), but then it would depend on DNS;
                       an IP address could be used instead, but then it must be correct on the initiator node.
                     - target hostname / IP address (same notes as for the source hostname)
                     - time-based security tokens
                -->
                <!-- <secret></secret> -->
                <shard>
                  <!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
                  <!-- <internal_replication>false</internal_replication> -->
                  <!-- Optional. Shard weight when writing data. Default: 1. -->
                  <!-- <weight>1</weight> -->
                  <replica>
                    <host>clickhouse</host>
                    <port>9000</port>
                    <!-- Optional. Priority of the replica for load_balancing. Default: 1 (a lower value has higher priority). -->
                    <!-- <priority>1</priority> -->
                  </replica>
                </shard>
                <!-- <shard>
                  <replica>
                    <host>clickhouse-2</host>
                    <port>9000</port>
                  </replica>
                </shard>
                <shard>
                  <replica>
                    <host>clickhouse-3</host>
                    <port>9000</port>
                  </replica>
                </shard> -->
              </cluster>
            </remote_servers>
          </clickhouse>
      - type: volume
        source: clickhouse
        target: /var/lib/clickhouse/
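  # A quick sanity check for the UDF once ClickHouse is up (a sketch; the
  # bucket/count array values are purely illustrative):
  #   docker exec signoz-clickhouse clickhouse-client -q \
  #     "SELECT histogramQuantile([0.1, 1.0, 10.0], [10.0, 50.0, 100.0], 0.95)"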
  signoz:
    image: signoz/signoz:latest
    container_name: signoz
    depends_on:
      clickhouse:
        condition: service_healthy
      schema-migrator-sync:
        condition: service_completed_successfully
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    command:
      - --config=/root/config/prometheus.yml
    volumes:
      - type: bind
        source: ./prometheus.yml
        target: /root/config/prometheus.yml
        content: |
          # my global config
          global:
            scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute.
            evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
            # scrape_timeout is set to the global default (10s).
          # Alertmanager configuration
          alerting:
            alertmanagers:
              - static_configs:
                  - targets:
                      - alertmanager:9093
          # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
          rule_files: []
          # - "first_rules.yml"
          # - "second_rules.yml"
          # - 'alerts.yml'
          # A scrape configuration containing exactly one endpoint to scrape:
          # Here it's Prometheus itself.
          scrape_configs: []
          remote_read:
            - url: tcp://clickhouse:9000/signoz_metrics
      - type: volume
        source: sqlite
        target: /var/lib/signoz/
    environment:
      - SERVICE_FQDN_SIGNOZ_8080
      - SIGNOZ_ALERTMANAGER_PROVIDER=signoz
      - SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
      - SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
      - DASHBOARDS_PATH=/root/config/dashboards
      - STORAGE=clickhouse
      - GODEBUG=netdns=go
      - DEPLOYMENT_TYPE=docker-standalone-amd
      - TELEMETRY_ENABLED=${TELEMETRY_ENABLED:-true}
      - SMTP_ENABLED=${SMTP_ENABLED:-false}
      - SMTP_FROM=${SMTP_FROM}
      - SMTP_HOST=${SMTP_HOST}
      - SMTP_PORT=${SMTP_PORT}
      - SMTP_USERNAME=${SMTP_USERNAME}
      - SMTP_PASSWORD=${SMTP_PASSWORD}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__PASSWORD=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__PASSWORD}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__USERNAME=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__USERNAME}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__FROM=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__FROM}
      - SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__SMARTHOST=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__SMARTHOST}
    healthcheck:
      test:
        - CMD
        - wget
        - --spider
        - -q
        - localhost:8080/api/v1/health
      interval: 30s
      timeout: 5s
      retries: 3
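  # The SigNoz UI and API are served on port 8080 (probed by the healthcheck
  # above); Coolify exposes it through SERVICE_FQDN_SIGNOZ_8080.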
  otel-collector:
    image: signoz/signoz-otel-collector:latest
    container_name: signoz-otel-collector
    depends_on:
      clickhouse:
        condition: service_healthy
      schema-migrator-sync:
        condition: service_completed_successfully
      signoz:
        condition: service_healthy
    restart: unless-stopped
    logging:
      options:
        max-size: 50m
        max-file: "3"
    ports:
      - "4317:4317" # OTLP gRPC receiver
      - "4318:4318" # OTLP HTTP receiver
    command:
      - --config=/etc/otel-collector-config.yaml
      - --manager-config=/etc/manager-config.yaml
      - --copy-path=/var/tmp/collector-config.yaml
      - --feature-gates=-pkg.translator.prometheus.NormalizeName
    volumes:
      - type: bind
        source: ./otel-collector-config.yaml
        target: /etc/otel-collector-config.yaml
        content: |
          receivers:
            otlp:
              protocols:
                grpc:
                  endpoint: 0.0.0.0:4317
                http:
                  endpoint: 0.0.0.0:4318
            prometheus:
              config:
                global:
                  scrape_interval: 60s
                scrape_configs:
                  - job_name: otel-collector
                    static_configs:
                      - targets:
                          - localhost:8888
                        labels:
                          job_name: otel-collector
          processors:
            batch:
              send_batch_size: 10000
              send_batch_max_size: 11000
              timeout: 10s
            resourcedetection:
              # The env detector reads the OTEL_RESOURCE_ATTRIBUTES env var to add custom labels.
              detectors: [env, system]
              timeout: 2s
            signozspanmetrics/delta:
              metrics_exporter: clickhousemetricswrite
              metrics_flush_interval: 60s
              latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
              dimensions_cache_size: 100000
              aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
              enable_exp_histogram: true
              dimensions:
                - name: service.namespace
                  default: default
                - name: deployment.environment
                  default: default
                # This is added to ensure the uniqueness of the timeseries.
                # Otherwise, identical timeseries produced by multiple collector
                # replicas result in incorrect APM metrics.
                - name: signoz.collector.id
                - name: service.version
                - name: browser.platform
                - name: browser.mobile
                - name: k8s.cluster.name
                - name: k8s.node.name
                - name: k8s.namespace.name
                - name: host.name
                - name: host.type
                - name: container.name
          extensions:
            health_check:
              endpoint: 0.0.0.0:13133
            pprof:
              endpoint: 0.0.0.0:1777
          exporters:
            clickhousetraces:
              datasource: tcp://clickhouse:9000/signoz_traces
              low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
              use_new_schema: true
            clickhousemetricswrite:
              endpoint: tcp://clickhouse:9000/signoz_metrics
              resource_to_telemetry_conversion:
                enabled: true
            clickhousemetricswrite/prometheus:
              endpoint: tcp://clickhouse:9000/signoz_metrics
            signozclickhousemetrics:
              dsn: tcp://clickhouse:9000/signoz_metrics
            clickhouselogsexporter:
              dsn: tcp://clickhouse:9000/signoz_logs
              timeout: 10s
              use_new_schema: true
            # debug: {}
          service:
            telemetry:
              logs:
                encoding: json
              metrics:
                address: 0.0.0.0:8888
            extensions:
              - health_check
              - pprof
            pipelines:
              traces:
                receivers: [otlp]
                processors: [signozspanmetrics/delta, batch]
                exporters: [clickhousetraces]
              metrics:
                receivers: [otlp]
                processors: [batch]
                exporters: [clickhousemetricswrite, signozclickhousemetrics]
              metrics/prometheus:
                receivers: [prometheus]
                processors: [batch]
                exporters: [clickhousemetricswrite/prometheus, signozclickhousemetrics]
              logs:
                receivers: [otlp]
                processors: [batch]
                exporters: [clickhouselogsexporter]
      - type: bind
        source: ./otel-collector-opamp-config.yaml
        target: /etc/manager-config.yaml
        content: |
          server_endpoint: ws://signoz:4320/v1/opamp
    environment:
      - SERVICE_FQDN_OTELCOLLECTORGRPC_4317
      - SERVICE_FQDN_OTELCOLLECTORHTTP_4318
      - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
      - LOW_CARDINAL_EXCEPTION_GROUPING=false
    healthcheck:
      # Opens a raw TCP connection to the health_check extension port via bash's /dev/tcp.
      test: bash -c "exec 6<> /dev/tcp/localhost/13133"
      interval: 30s
      timeout: 5s
      retries: 3
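  # To ship telemetry from an application, point its OTLP exporter at this
  # collector using the standard OpenTelemetry SDK variables; for example
  # (host and service name are placeholders):
  #   export OTEL_EXPORTER_OTLP_ENDPOINT="http://<server-ip>:4318"
  #   export OTEL_RESOURCE_ATTRIBUTES="service.name=my-app"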
  schema-migrator-sync:
    image: signoz/signoz-schema-migrator:latest
    container_name: schema-migrator-sync
    command:
      - sync
      - --dsn=tcp://clickhouse:9000
      - --up=
    depends_on:
      clickhouse:
        condition: service_healthy
    restart: on-failure
    logging:
      options:
        max-size: 50m
        max-file: "3"
  schema-migrator-async:
    image: signoz/signoz-schema-migrator:latest
    container_name: schema-migrator-async
    depends_on:
      clickhouse:
        condition: service_healthy
      schema-migrator-sync:
        condition: service_completed_successfully
    logging:
      options:
        max-size: 50m
        max-file: "3"
    command:
      - async
      - --dsn=tcp://clickhouse:9000
      - --up=
    restart: on-failure
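  # Note: the sync migrator must finish before signoz and otel-collector start
  # (both depend on its service_completed_successfully condition); the async
  # migrator then applies the remaining long-running migrations in the
  # background. The empty --up= mirrors the upstream SigNoz compose files.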