# File-viewer residue (not part of the configuration): 263 lines, 13 KiB, YAML

# Required env vars (export before `docker compose up -d`):
# ALLOY_HOSTNAME, REMOTECFG_URL, REMOTECFG_ID, REMOTECFG_USER,
# PROM_URL, PROM_USER, LOKI_URL, LOKI_USER, GRAFANA_TOKEN
services:
  # Grafana Alloy agent. Runs privileged on the host network, with host paths
  # bind-mounted into the container (see `volumes` below) so the embedded
  # exporters and log readers observe the host rather than the container.
  alloy:
    image: grafana/alloy:v1.16.1
    container_name: alloy
    restart: unless-stopped
    # `${VAR:?required}` makes `docker compose` refuse to start when the
    # variable is unset or empty, instead of passing an empty value through.
    hostname: ${ALLOY_HOSTNAME:?required}
    privileged: true
    # node_exporter netdev/netstat sees real host interfaces (eth0…) not veth
    network_mode: host
    # Values read by the Alloy configuration through sys.env(...).
    environment:
      ALLOY_DEPLOY_MODE: docker
      REMOTECFG_URL: ${REMOTECFG_URL:?required}
      REMOTECFG_ID: ${REMOTECFG_ID:?required}
      REMOTECFG_USER: ${REMOTECFG_USER:?required}
      PROM_URL: ${PROM_URL:?required}
      PROM_USER: ${PROM_USER:?required}
      LOKI_URL: ${LOKI_URL:?required}
      LOKI_USER: ${LOKI_USER:?required}
      GRAFANA_TOKEN: ${GRAFANA_TOKEN:?required}
    volumes:
      # Persistent state for Alloy; same path as --storage.path in `command`.
      - alloy-data:/var/lib/alloy/data
      # Host /proc, referenced as procfs_path = "/rootproc" in the Alloy config.
      - /proc:/rootproc:ro
      # Docker API socket, used by the Docker discovery/log components.
      - /var/run/docker.sock:/var/run/docker.sock
      - /sys:/sys:ro
      # Host root filesystem, referenced as rootfs_path = "/rootfs".
      - /:/rootfs:ro
      - /dev/disk/:/dev/disk:ro
      - /var/lib/docker/:/var/lib/docker:ro
      # journal + syslog/messages/*.log for the Linux-Node integration
      - /var/log:/var/log:ro
      # stable host id for systemd journal reader
      - /etc/machine-id:/etc/machine-id:ro
    # Mount the inline `alloy_config` config as the file Alloy is started with.
    configs:
      - source: alloy_config
        target: /etc/alloy/config.alloy
    command: 'run --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy'
volumes:
  # Named volume mounted at /var/lib/alloy/data, the directory Alloy is given
  # as --storage.path. No driver options: an explicit empty definition.
  alloy-data: {}
configs:
  # Inline Grafana Alloy configuration; the `alloy` service mounts it at
  # /etc/alloy/config.alloy.
  # NOTE(review): Compose applies variable interpolation to inline `content`.
  # The regexes below contain bare `$` anchors that are not followed by a
  # variable name — confirm they pass through unchanged on your Compose
  # version, or escape them as `$$`.
  alloy_config:
    content: |
      // Poll the remote configuration endpoint every 60s for additional
      // configuration, authenticating with basic auth.
      remotecfg {
        url = sys.env("REMOTECFG_URL")
        id = sys.env("REMOTECFG_ID")
        poll_frequency = "60s"

        basic_auth {
          username = sys.env("REMOTECFG_USER")
          password = sys.env("GRAFANA_TOKEN")
        }
      }

      // Metrics sink: every prometheus.relabel block below forwards here.
      prometheus.remote_write "metrics_service" {
        endpoint {
          url = sys.env("PROM_URL")

          basic_auth {
            username = sys.env("PROM_USER")
            password = sys.env("GRAFANA_TOKEN")
          }
        }
      }

      // Logs sink: journal, file and Docker log pipelines all end here.
      loki.write "grafana_cloud_loki" {
        endpoint {
          url = sys.env("LOKI_URL")

          basic_auth {
            username = sys.env("LOKI_USER")
            password = sys.env("GRAFANA_TOKEN")
          }
        }
      }

      // ---------------------------------------------------------------
      // Linux node metrics (node_exporter)
      // ---------------------------------------------------------------

      // Stamp the exporter targets with fixed `instance` and `job` labels.
      discovery.relabel "integrations_node_exporter" {
        targets = prometheus.exporter.unix.integrations_node_exporter.targets

        rule {
          target_label = "instance"
          replacement = constants.hostname
        }

        rule {
          target_label = "job"
          replacement = "integrations/node_exporter"
        }
      }

      prometheus.exporter.unix "integrations_node_exporter" {
        disable_collectors = ["ipvs", "btrfs", "infiniband", "xfs", "zfs"]

        // Read host filesystems via bind mounts instead of the container namespace.
        rootfs_path = "/rootfs"
        procfs_path = "/rootproc"

        filesystem {
          fs_types_exclude = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
          mount_points_exclude = "^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+)($|/)"
          mount_timeout = "5s"
        }

        // The same device pattern is excluded from both network collectors.
        netclass {
          ignored_devices = "^(veth.*|cali.*|[a-f0-9]{15})$"
        }

        netdev {
          device_exclude = "^(veth.*|cali.*|[a-f0-9]{15})$"
        }
      }

      prometheus.scrape "integrations_node_exporter" {
        targets = discovery.relabel.integrations_node_exporter.output
        forward_to = [prometheus.relabel.integrations_node_exporter.receiver]
      }

      // Keep only the metrics enumerated under "Metrics" in the upstream
      // Grafana Cloud Linux Node integration page (157 metrics, verbatim).
      // The recording-rule output `instance:node_num_cpu:sum` is computed
      // server-side by Grafana Cloud's ruler — not shipped from here.
      // Keep the pattern on one physical line: a line break inside the
      // string would split a metric name.
      prometheus.relabel "integrations_node_exporter" {
        forward_to = [prometheus.remote_write.metrics_service.receiver]

        rule {
          source_labels = ["__name__"]
          regex = "node_arp_entries|node_boot_time_seconds|node_context_switches_total|node_cpu_seconds_total|node_disk_io_time_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_read_bytes_total|node_disk_read_time_seconds_total|node_disk_reads_completed_total|node_disk_write_time_seconds_total|node_disk_writes_completed_total|node_disk_written_bytes_total|node_filefd_allocated|node_filefd_maximum|node_filesystem_avail_bytes|node_filesystem_device_error|node_filesystem_files|node_filesystem_files_free|node_filesystem_readonly|node_filesystem_size_bytes|node_intr_total|node_load1|node_load15|node_load5|node_md_disks|node_md_disks_required|node_memory_Active_anon_bytes|node_memory_Active_bytes|node_memory_Active_file_bytes|node_memory_AnonHugePages_bytes|node_memory_AnonPages_bytes|node_memory_Bounce_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_CommitLimit_bytes|node_memory_Committed_AS_bytes|node_memory_DirectMap1G_bytes|node_memory_DirectMap2M_bytes|node_memory_DirectMap4k_bytes|node_memory_Dirty_bytes|node_memory_HugePages_Free|node_memory_HugePages_Rsvd|node_memory_HugePages_Surp|node_memory_HugePages_Total|node_memory_Hugepagesize_bytes|node_memory_Inactive_anon_bytes|node_memory_Inactive_bytes|node_memory_Inactive_file_bytes|node_memory_Mapped_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_memory_SReclaimable_bytes|node_memory_SUnreclaim_bytes|node_memory_ShmemHugePages_bytes|node_memory_ShmemPmdMapped_bytes|node_memory_Shmem_bytes|node_memory_Slab_bytes|node_memory_SwapTotal_bytes|node_memory_VmallocChunk_bytes|node_memory_VmallocTotal_bytes|node_memory_VmallocUsed_bytes|node_memory_WritebackTmp_bytes|node_memory_Writeback_bytes|node_netstat_Icmp6_InErrors|node_netstat_Icmp6_InMsgs|node_netstat_Icmp6_OutMsgs|node_netstat_Icmp_InErrors|node_netstat_Icmp_InMsgs|node_netstat_Icmp_OutMsgs|node_netstat_IpExt_InOctets|node_netstat_IpExt_OutOctets|node_netstat_TcpExt_ListenDrops|node_netstat_TcpExt_ListenOverflows|node_netstat_TcpExt_TCPSynRetrans|node_netstat_Tcp_InErrs|node_netstat_Tcp_InSegs|node_netstat_Tcp_OutRsts|node_netstat_Tcp_OutSegs|node_netstat_Tcp_RetransSegs|node_netstat_Udp6_InDatagrams|node_netstat_Udp6_InErrors|node_netstat_Udp6_NoPorts|node_netstat_Udp6_OutDatagrams|node_netstat_Udp6_RcvbufErrors|node_netstat_Udp6_SndbufErrors|node_netstat_UdpLite_InErrors|node_netstat_Udp_InDatagrams|node_netstat_Udp_InErrors|node_netstat_Udp_NoPorts|node_netstat_Udp_OutDatagrams|node_netstat_Udp_RcvbufErrors|node_netstat_Udp_SndbufErrors|node_network_carrier|node_network_info|node_network_mtu_bytes|node_network_receive_bytes_total|node_network_receive_compressed_total|node_network_receive_drop_total|node_network_receive_errs_total|node_network_receive_fifo_total|node_network_receive_multicast_total|node_network_receive_packets_total|node_network_speed_bytes|node_network_transmit_bytes_total|node_network_transmit_compressed_total|node_network_transmit_drop_total|node_network_transmit_errs_total|node_network_transmit_fifo_total|node_network_transmit_multicast_total|node_network_transmit_packets_total|node_network_transmit_queue_length|node_network_up|node_nf_conntrack_entries|node_nf_conntrack_entries_limit|node_os_info|node_procs_running|node_sockstat_FRAG6_inuse|node_sockstat_FRAG_inuse|node_sockstat_RAW6_inuse|node_sockstat_RAW_inuse|node_sockstat_TCP6_inuse|node_sockstat_TCP_alloc|node_sockstat_TCP_inuse|node_sockstat_TCP_mem|node_sockstat_TCP_mem_bytes|node_sockstat_TCP_orphan|node_sockstat_TCP_tw|node_sockstat_UDP6_inuse|node_sockstat_UDPLITE6_inuse|node_sockstat_UDPLITE_inuse|node_sockstat_UDP_inuse|node_sockstat_UDP_mem|node_sockstat_UDP_mem_bytes|node_sockstat_sockets_used|node_softnet_dropped_total|node_softnet_processed_total|node_softnet_times_squeezed_total|node_systemd_service_restart_total|node_systemd_unit_state|node_textfile_scrape_error|node_time_zone_offset_seconds|node_timex_estimated_error_seconds|node_timex_maxerror_seconds|node_timex_offset_seconds|node_timex_sync_status|node_uname_info|node_vmstat_oom_kill|node_vmstat_pgfault|node_vmstat_pgmajfault|node_vmstat_pgpgin|node_vmstat_pgpgout|node_vmstat_pswpin|node_vmstat_pswpout|process_max_fds|process_open_fds|up"
          action = "keep"
        }
      }

      // ---------------------------------------------------------------
      // Linux node logs (journal + files)
      // ---------------------------------------------------------------

      // Shared labelling stage for journal and file logs before they are
      // written to Loki.
      loki.relabel "integrations_node_exporter" {
        forward_to = [loki.write.grafana_cloud_loki.receiver]

        rule {
          target_label = "job"
          replacement = "integrations/node_exporter"
        }

        rule {
          target_label = "instance"
          replacement = constants.hostname
        }
      }

      // Instantiate the custom `journal_module` component declared below.
      journal_module "integrations_node_exporter" {
        forward_to = [loki.relabel.integrations_node_exporter.receiver]
      }

      // File-based logs from the Linux-Node integration: syslog/messages/*.log.
      // On systemd hosts, rsyslog often mirrors journald — see README "Caveats".
      local.file_match "integrations_node_exporter_files" {
        path_targets = [{
          __address__ = "localhost",
          __path__ = "/var/log/{syslog,messages,*.log}",
        }]
      }

      loki.source.file "integrations_node_exporter_files" {
        targets = local.file_match.integrations_node_exporter_files.targets
        forward_to = [loki.relabel.integrations_node_exporter.receiver]
      }

      // Custom component: reads the systemd journal, maps selected journal
      // fields to labels, and hands entries to the receivers passed in
      // through the required `forward_to` argument.
      declare "journal_module" {
        argument "forward_to" {
          optional = false
        }

        loki.source.journal "default" {
          max_age = "12h0m0s"
          forward_to = [loki.process.default.receiver]
          relabel_rules = loki.relabel.default.rules
        }

        // Rule container only: `forward_to` is empty and just the exported
        // `rules` are consumed by loki.source.journal above.
        loki.relabel "default" {
          rule {
            source_labels = ["__journal__systemd_unit"]
            target_label = "unit"
          }

          rule {
            source_labels = ["__journal__boot_id"]
            target_label = "boot_id"
          }

          rule {
            source_labels = ["__journal__transport"]
            target_label = "transport"
          }

          rule {
            source_labels = ["__journal_priority_keyword"]
            target_label = "level"
          }

          forward_to = []
        }

        // No processing stages are defined; entries go straight to the
        // caller-supplied receivers.
        loki.process "default" {
          forward_to = argument.forward_to.value
        }
      }

      // ---------------------------------------------------------------
      // Docker container metrics (cAdvisor)
      // ---------------------------------------------------------------

      prometheus.exporter.cadvisor "integrations_cadvisor" {
        docker_only = true
      }

      // Stamp the cAdvisor targets with fixed `job` and `instance` labels.
      discovery.relabel "integrations_cadvisor" {
        targets = prometheus.exporter.cadvisor.integrations_cadvisor.targets

        rule {
          target_label = "job"
          replacement = "integrations/docker"
        }

        rule {
          target_label = "instance"
          replacement = constants.hostname
        }
      }

      // Keep only the listed container/machine series before remote write.
      prometheus.relabel "integrations_cadvisor" {
        forward_to = [prometheus.remote_write.metrics_service.receiver]

        rule {
          source_labels = ["__name__"]
          regex = "container_cpu_usage_seconds_total|container_fs_reads_total|container_fs_usage_bytes|container_fs_writes_total|container_last_seen|container_memory_cache|container_memory_rss|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_errors_total|container_network_receive_packets_dropped_total|container_network_transmit_bytes_total|container_network_transmit_errors_total|container_network_transmit_packets_dropped_total|container_spec_memory_limit_bytes|container_spec_memory_reservation_limit_bytes|machine_memory_bytes|machine_scrape_error|up"
          action = "keep"
        }
      }

      prometheus.scrape "integrations_cadvisor" {
        targets = discovery.relabel.integrations_cadvisor.output
        forward_to = [prometheus.relabel.integrations_cadvisor.receiver]
      }

      // ---------------------------------------------------------------
      // Docker container logs
      // ---------------------------------------------------------------

      discovery.docker "logs_integrations_docker" {
        host = "unix:///var/run/docker.sock"
        refresh_interval = "5s"
      }

      // Rule container only: `targets` is empty and just the exported
      // `rules` are consumed by loki.source.docker below.
      discovery.relabel "logs_integrations_docker" {
        targets = []

        rule {
          target_label = "job"
          replacement = "integrations/docker"
        }

        rule {
          target_label = "instance"
          replacement = constants.hostname
        }

        // Capture the container name without its leading "/".
        rule {
          source_labels = ["__meta_docker_container_name"]
          regex = "/(.*)"
          target_label = "container"
        }

        rule {
          source_labels = ["__meta_docker_container_log_stream"]
          target_label = "stream"
        }
      }

      loki.source.docker "logs_integrations_docker" {
        host = "unix:///var/run/docker.sock"
        targets = discovery.docker.logs_integrations_docker.targets
        forward_to = [loki.write.grafana_cloud_loki.receiver]
        relabel_rules = discovery.relabel.logs_integrations_docker.rules
        refresh_interval = "5s"
      }