Initial commit
This commit is contained in:
96
playbooks/README-netdata.md
Normal file
96
playbooks/README-netdata.md
Normal file
@@ -0,0 +1,96 @@
|
||||
# Netdata Parent-Child Configuration
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
This playbook configures a centralized monitoring setup using Netdata's parent-child streaming:
|
||||
|
||||
- **Parent Node**: Fedora workstation (192.168.1.243) - Stores all metrics, provides dashboard
|
||||
- **Child Nodes**: All other hosts - Stream metrics to parent, minimal local storage
|
||||
|
||||
## Current Installation Status
|
||||
|
||||
Based on the latest check:
|
||||
|
||||
### ✅ Already Running
|
||||
- `lenovo420` - Fedora workstation (1.7GB memory usage)
|
||||
- `omv800` - OpenMediaVault server (561MB memory usage)
|
||||
- `lenovo` - Lenovo laptop (438MB memory usage)
|
||||
|
||||
### ❌ Need Installation
|
||||
- `surface` - Surface tablet
|
||||
- `fedora` - **Target parent node**
|
||||
- `omvbackup` - Backup server
|
||||
|
||||
### ⚠️ Unreachable
|
||||
- `audrey` - Connection timeout (may be offline)
|
||||
|
||||
## Resource Usage Benefits
|
||||
|
||||
**Before (current standalone instances):**
|
||||
- Total memory: ~2.7GB across 3 nodes
|
||||
- Each node stores full retention
|
||||
- Duplicated web interfaces
|
||||
|
||||
**After (parent-child setup):**
|
||||
- Parent: ~512MB (centralized storage + web UI)
|
||||
- Children: ~32MB each (streaming only)
|
||||
- Total memory: ~640MB (76% reduction)
|
||||
|
||||
## Running the Playbook
|
||||
|
||||
```bash
|
||||
# Configure all nodes
|
||||
ansible-playbook -i inventory.ini playbooks/netdata-parent-child.yml
|
||||
|
||||
# Configure only specific hosts
|
||||
ansible-playbook -i inventory.ini playbooks/netdata-parent-child.yml --limit fedora,surface
|
||||
|
||||
# Test connectivity after setup
|
||||
ansible-playbook -i inventory.ini playbooks/netdata-parent-child.yml --tags verify
|
||||
```
|
||||
|
||||
## Configuration Details
|
||||
|
||||
### Parent Node (fedora)
|
||||
- Binds to `*:19999` (accessible from network)
|
||||
- 2GB disk space for metrics storage
|
||||
- Receives streams from all children
|
||||
- Provides centralized dashboard
|
||||
|
||||
### Child Nodes (all others)
|
||||
- Memory mode: `none` (no local storage)
|
||||
- Binds to `localhost:19999` only
|
||||
- Streams all metrics to parent
|
||||
- Minimal resource footprint
|
||||
|
||||
## Accessing the Dashboard
|
||||
|
||||
After configuration:
|
||||
- **Central Dashboard**: http://192.168.1.243:19999
|
||||
- **Individual Node Access**: http://node-ip:19999 (localhost only on children)
|
||||
|
||||
## Security Features
|
||||
|
||||
- API key authentication for streaming
|
||||
- Network restrictions (home + Tailscale networks only)
|
||||
- Local-only access on child nodes
|
||||
- Firewall rules automatically configured
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Check stream status on parent:
|
||||
```bash
|
||||
curl http://192.168.1.243:19999/api/v1/info | jq '.hosts'
|
||||
```
|
||||
|
||||
### Check streaming on child:
|
||||
```bash
|
||||
systemctl status netdata
|
||||
journalctl -u netdata -f
|
||||
```
|
||||
|
||||
### Verify connectivity:
|
||||
```bash
|
||||
# From child node
|
||||
curl -I http://192.168.1.243:19999/api/v1/info
|
||||
```
|
||||
190
playbooks/extend-monitoring.yml
Normal file
190
playbooks/extend-monitoring.yml
Normal file
@@ -0,0 +1,190 @@
|
||||
---
|
||||
- name: Extend monitoring to non-Linux devices
|
||||
hosts: fedora # Run on parent node
|
||||
become: yes
|
||||
vars:
|
||||
network_devices:
|
||||
- name: "Gateway Router"
|
||||
ip: "192.168.50.1"
|
||||
type: "router"
|
||||
- name: "Immich Photos"
|
||||
ip: "192.168.50.66"
|
||||
type: "docker_host"
|
||||
- name: "Unknown Device 1"
|
||||
ip: "192.168.50.20"
|
||||
type: "unknown"
|
||||
- name: "Unknown Device 2"
|
||||
ip: "192.168.50.25"
|
||||
type: "unknown"
|
||||
|
||||
tasks:
|
||||
- name: Create network monitoring configuration
|
||||
copy:
|
||||
content: |
|
||||
# Network Device Monitoring Configuration
|
||||
# Add to /etc/netdata/go.d/ping.conf
|
||||
|
||||
jobs:
|
||||
- name: home_network_ping
|
||||
hosts:
|
||||
{% for device in network_devices %}
|
||||
- {{ device.ip }} # {{ device.name }}
|
||||
{% endfor %}
|
||||
interval: 30s
|
||||
count: 3
|
||||
timeout: 1s
|
||||
|
||||
- name: critical_services_ping
|
||||
hosts:
|
||||
- 192.168.50.1 # Gateway
|
||||
- 192.168.50.66 # Immich Photos
|
||||
- 8.8.8.8 # Google DNS
|
||||
- 1.1.1.1 # Cloudflare DNS
|
||||
interval: 10s
|
||||
count: 1
|
||||
timeout: 2s
|
||||
|
||||
dest: /etc/netdata/go.d/ping.conf
|
||||
|
||||
- name: Enable ping monitoring in Netdata
|
||||
lineinfile:
|
||||
path: /etc/netdata/netdata.conf
|
||||
regexp: '^.*\[plugin:go.d\]'
|
||||
line: '[plugin:go.d]'
|
||||
create: yes
|
||||
|
||||
- name: Configure ping plugin
|
||||
blockinfile:
|
||||
path: /etc/netdata/netdata.conf
|
||||
marker: "# {mark} PING MONITORING CONFIG"
|
||||
block: |
|
||||
[plugin:go.d]
|
||||
ping = yes
|
||||
|
||||
- name: Create Docker monitoring script for remote hosts
|
||||
copy:
|
||||
content: |
|
||||
#!/bin/bash
|
||||
# Monitor Docker containers on remote hosts
|
||||
|
||||
REMOTE_HOSTS=(
|
||||
"192.168.50.66:photos.local" # Immich Photos
|
||||
"100.78.26.112:omv800" # OMV with Docker
|
||||
"100.98.144.95:lenovo420" # Lenovo with Docker
|
||||
)
|
||||
|
||||
for host_info in "${REMOTE_HOSTS[@]}"; do
|
||||
IFS=':' read -r ip hostname <<< "$host_info"
|
||||
echo "=== Docker containers on $hostname ($ip) ==="
|
||||
|
||||
# Try to get container stats via SSH
|
||||
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "jon@$ip" "docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}'" 2>/dev/null; then
|
||||
echo "SSH connection successful"
|
||||
else
|
||||
echo "SSH connection failed - checking if HTTP monitoring available"
|
||||
# Check for cAdvisor or similar
|
||||
if curl -s --connect-timeout 3 "http://$ip:8080/metrics" >/dev/null 2>&1; then
|
||||
echo "cAdvisor metrics available at http://$ip:8080"
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
done
|
||||
dest: /usr/local/bin/check-remote-docker.sh
|
||||
mode: '0755'
|
||||
|
||||
- name: Add cron job for Docker monitoring
|
||||
cron:
|
||||
name: "Check remote Docker containers"
|
||||
minute: "*/5"
|
||||
job: "/usr/local/bin/check-remote-docker.sh >> /var/log/remote-docker-check.log 2>&1"
|
||||
|
||||
- name: Create Windows monitoring via WMI (if available)
|
||||
copy:
|
||||
content: |
|
||||
# Windows monitoring configuration
|
||||
# Requires wmic or PowerShell remoting to be enabled
|
||||
|
||||
[plugin:apps]
|
||||
# Disable apps monitoring to reduce load
|
||||
enabled = no
|
||||
|
||||
# Example Windows WMI monitoring (requires additional setup)
|
||||
# [plugin:wmi]
|
||||
# command = wmic -U domain/user%password //192.168.50.100 "SELECT * FROM Win32_Processor"
|
||||
# update every = 60
|
||||
|
||||
dest: /etc/netdata/windows-monitoring.conf
|
||||
|
||||
- name: Test ping monitoring
|
||||
shell: |
|
||||
/usr/libexec/netdata/plugins.d/go.d.plugin -m ping -d
|
||||
register: ping_test
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Display ping test results
|
||||
debug:
|
||||
var: ping_test.stdout_lines
|
||||
|
||||
- name: Restart Netdata to apply changes
|
||||
systemd:
|
||||
name: netdata
|
||||
state: restarted
|
||||
|
||||
- name: Create monitoring dashboard links
|
||||
copy:
|
||||
content: |
|
||||
# Extended Monitoring Dashboard Links
|
||||
|
||||
## Main Netdata Dashboard
|
||||
http://{{ ansible_host }}:19999
|
||||
|
||||
## Network Ping Monitoring
|
||||
http://{{ ansible_host }}:19999/#menu_ping_submenu_home_network_ping
|
||||
|
||||
## System Overview
|
||||
http://{{ ansible_host }}:19999/#menu_system_submenu_cpu
|
||||
|
||||
## All Connected Hosts
|
||||
{% for host in groups['all_linux'] %}
|
||||
- {{ host }}: http://{{ ansible_host }}:19999/host/{{ host }}
|
||||
{% endfor %}
|
||||
|
||||
## Discovered Network Devices (52 total)
|
||||
- Gateway: 192.168.50.1
|
||||
- Photos Server: 192.168.50.66
|
||||
- 50 other devices (IoT, mobile, Windows, etc.)
|
||||
|
||||
## Monitoring Capabilities by Device Type:
|
||||
|
||||
### ✅ Full Monitoring (Linux hosts with Netdata)
|
||||
- fedora (parent)
|
||||
- lenovo420, omv800, lenovo, surface, omvbackup (children)
|
||||
|
||||
### 📊 Limited Monitoring (ping, port checks)
|
||||
- Router/gateway
|
||||
- Docker hosts
|
||||
- Network appliances
|
||||
|
||||
### ⚠️ Manual Monitoring Required
|
||||
- Windows machines (need WMI/SNMP setup)
|
||||
- Mobile devices (battery optimization prevents agents)
|
||||
- IoT devices (resource constrained)
|
||||
|
||||
dest: /home/jonathan/monitoring-dashboard.md
|
||||
owner: jonathan
|
||||
group: jonathan
|
||||
|
||||
- name: Summary of monitoring coverage
|
||||
debug:
|
||||
msg: |
|
||||
Monitoring Coverage Summary:
|
||||
|
||||
✅ Linux hosts: 6/6 (100%) - Full system monitoring
|
||||
📊 Network devices: 52 discovered - Ping monitoring added
|
||||
🐳 Docker containers: Remote monitoring via SSH/API
|
||||
📱 Mobile/IoT: 46+ devices - Network connectivity only
|
||||
|
||||
Total network coverage: Ping monitoring for all 52 devices
|
||||
Detailed monitoring: 6 Linux systems (infrastructure core)
|
||||
|
||||
Dashboard: http://{{ ansible_host }}:19999
|
||||
13
playbooks/group_vars/all.yml
Normal file
13
playbooks/group_vars/all.yml
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
# Netdata Configuration
|
||||
netdata_api_key: "homelab-stream-key-2024-secure"
|
||||
netdata_parent_retention_hours: 168 # 7 days
|
||||
netdata_child_retention_hours: 1 # 1 hour local buffer
|
||||
|
||||
# Monitoring Settings
|
||||
netdata_memory_limit_mb: 128 # Memory limit for child nodes
|
||||
netdata_disk_space_mb: 512 # Disk space limit for child nodes
|
||||
|
||||
# Network Configuration
|
||||
home_network_cidr: "192.168.1.0/24"
|
||||
tailscale_network_cidr: "100.64.0.0/10"
|
||||
198
playbooks/netdata-parent-child.yml
Normal file
198
playbooks/netdata-parent-child.yml
Normal file
@@ -0,0 +1,198 @@
|
||||
---
|
||||
- name: Configure Netdata Parent-Child Streaming Architecture
|
||||
hosts: all
|
||||
become: yes
|
||||
vars:
|
||||
netdata_parent_host: "192.168.50.225" # Fedora workstation wired IP (more stable)
|
||||
netdata_stream_api_key: "{{ netdata_api_key | default('homelab-stream-key-2024') }}"
|
||||
|
||||
tasks:
|
||||
- name: Install Netdata on hosts where it's missing
|
||||
block:
|
||||
- name: Install Netdata via package manager (Fedora/RHEL)
|
||||
dnf:
|
||||
name: netdata
|
||||
state: present
|
||||
when: ansible_os_family == "RedHat"
|
||||
|
||||
- name: Install Netdata via package manager (Debian/Ubuntu)
|
||||
apt:
|
||||
name: netdata
|
||||
state: present
|
||||
update_cache: yes
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Install Netdata via installer script (fallback)
|
||||
shell: |
|
||||
curl -L https://get.netdata.cloud/kickstart.sh | bash -s -- --stable-channel --disable-telemetry --dont-wait
|
||||
args:
|
||||
creates: /usr/sbin/netdata
|
||||
when: ansible_os_family not in ["RedHat", "Debian"]
|
||||
when: inventory_hostname in ['surface', 'fedora', 'omvbackup']
|
||||
|
||||
- name: Configure Netdata Parent (Central monitoring node)
|
||||
block:
|
||||
- name: Create Netdata parent configuration
|
||||
copy:
|
||||
content: |
|
||||
[global]
|
||||
# Parent node configuration
|
||||
memory mode = dbengine
|
||||
page cache size = 32
|
||||
dbengine disk space = 2048
|
||||
|
||||
# Network settings
|
||||
bind to = *:19999
|
||||
allow connections from = 192.168.50.0/24 100.64.0.0/10
|
||||
|
||||
# Disable local data collection plugins to reduce load
|
||||
[plugins]
|
||||
apps = no
|
||||
cgroups = no
|
||||
diskspace = no
|
||||
|
||||
[web]
|
||||
# Allow dashboard from any host on home network
|
||||
allow dashboard from = 192.168.50.0/24 100.64.0.0/10
|
||||
|
||||
# Stream configuration for receiving child data
|
||||
[stream]
|
||||
enabled = yes
|
||||
destination =
|
||||
api key = {{ netdata_stream_api_key }}
|
||||
timeout seconds = 60
|
||||
buffer size bytes = 1048576
|
||||
reconnect delay seconds = 5
|
||||
initial clock resync iterations = 60
|
||||
|
||||
# Accept streams from child nodes
|
||||
[{{ netdata_stream_api_key }}]
|
||||
enabled = yes
|
||||
allow from = 192.168.50.0/24 100.64.0.0/10
|
||||
default history = 3600
|
||||
default memory = dbengine
|
||||
health enabled by default = auto
|
||||
dest: /etc/netdata/netdata.conf
|
||||
backup: yes
|
||||
|
||||
- name: Enable and start Netdata parent
|
||||
systemd:
|
||||
name: netdata
|
||||
enabled: yes
|
||||
state: restarted
|
||||
|
||||
- name: Open firewall for Netdata (firewalld)
|
||||
firewalld:
|
||||
port: 19999/tcp
|
||||
permanent: yes
|
||||
state: enabled
|
||||
immediate: yes
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Open firewall for Netdata (ufw)
|
||||
ufw:
|
||||
rule: allow
|
||||
port: '19999'
|
||||
proto: tcp
|
||||
ignore_errors: yes
|
||||
|
||||
when: inventory_hostname == 'fedora'
|
||||
|
||||
- name: Configure Netdata Children (streaming to parent)
|
||||
block:
|
||||
- name: Configure Netdata child to stream to parent
|
||||
copy:
|
||||
content: |
|
||||
[global]
|
||||
# Child node - minimal local storage
|
||||
memory mode = none
|
||||
|
||||
# Network settings
|
||||
bind to = localhost:19999
|
||||
|
||||
[web]
|
||||
# Restrict dashboard access to localhost only
|
||||
allow dashboard from = localhost
|
||||
|
||||
# Stream all data to parent
|
||||
[stream]
|
||||
enabled = yes
|
||||
destination = {{ netdata_parent_host }}:19999
|
||||
api key = {{ netdata_stream_api_key }}
|
||||
timeout seconds = 60
|
||||
buffer size bytes = 1048576
|
||||
reconnect delay seconds = 5
|
||||
initial clock resync iterations = 60
|
||||
|
||||
# Send everything to parent
|
||||
send charts matching = *
|
||||
|
||||
# Reduce resource usage
|
||||
[plugins]
|
||||
# Keep essential monitoring but reduce frequency
|
||||
proc = yes
|
||||
diskspace = yes
|
||||
cgroups = yes
|
||||
apps = yes
|
||||
|
||||
# Optimize collection intervals
|
||||
[plugin:proc]
|
||||
/proc/net/dev = yes
|
||||
/proc/diskstats = yes
|
||||
/proc/net/sockstat = yes
|
||||
/proc/meminfo = yes
|
||||
/proc/vmstat = yes
|
||||
/proc/stat = yes
|
||||
/proc/loadavg = yes
|
||||
|
||||
dest: /etc/netdata/netdata.conf
|
||||
backup: yes
|
||||
|
||||
- name: Restart Netdata child
|
||||
systemd:
|
||||
name: netdata
|
||||
state: restarted
|
||||
|
||||
when: inventory_hostname != 'fedora'
|
||||
|
||||
- name: Verify Netdata service status
|
||||
systemd:
|
||||
name: netdata
|
||||
state: started
|
||||
enabled: yes
|
||||
register: netdata_status
|
||||
|
||||
- name: Display Netdata status
|
||||
debug:
|
||||
msg: |
|
||||
Netdata {{ 'parent' if inventory_hostname == 'fedora' else 'child' }} configured on {{ inventory_hostname }}
|
||||
Status: {{ netdata_status.status.ActiveState }}
|
||||
{% if inventory_hostname == 'fedora' %}
|
||||
Parent dashboard: http://{{ ansible_host }}:19999
|
||||
{% else %}
|
||||
Streaming to: {{ netdata_parent_host }}:19999
|
||||
{% endif %}
|
||||
|
||||
- name: Test connectivity to parent (from children)
|
||||
uri:
|
||||
url: "http://{{ netdata_parent_host }}:19999/api/v1/info"
|
||||
method: GET
|
||||
timeout: 10
|
||||
register: parent_test
|
||||
ignore_errors: yes
|
||||
when: inventory_hostname != 'fedora'
|
||||
|
||||
- name: Display connectivity test results
|
||||
debug:
|
||||
msg: |
|
||||
Connection to parent: {{ 'SUCCESS' if parent_test.status == 200 else 'FAILED' }}
|
||||
{% if parent_test.status != 200 %}
|
||||
Error: {{ parent_test.msg | default('Unknown error') }}
|
||||
{% endif %}
|
||||
when: inventory_hostname != 'fedora' and parent_test is defined
|
||||
|
||||
handlers:
|
||||
- name: restart netdata
|
||||
systemd:
|
||||
name: netdata
|
||||
state: restarted
|
||||
Reference in New Issue
Block a user