Nagios

Nagios is a powerful monitoring system that enables organizations to identify and resolve IT infrastructure problems before they affect critical business processes.

Check nagios configuration

   1 /usr/local/nagios/bin/nagios -v  /usr/local/nagios/etc/nagios.cfg
   2 # status service on CentOS
   3 service nagios status
   4 # restart service on CentOS
   5 service nagios restart

Install nagios client NRPE on CentOS 5.X

   1 cd /tmp
   2 wget dl.fedoraproject.org/pub/epel/5/i386/epel-release-5-4.noarch.rpm
   3 rpm -ivh epel-release-5-4.noarch.rpm
   4 yum install -y nrpe nagios-plugins-all openssl
   5 cp /etc/nagios/nrpe.cfg /etc/nagios/nrpe.cfg.back
   6 # Edit /etc/nagios/nrpe.cfg and add to allowed_hosts the IP address of the Nagios server
   7 service nrpe status
   8 service nrpe start
   9 chkconfig nrpe on
  10 cat /etc/xinetd.d/nrpe # check if exists to see where the config is
  11 

Install nagios client NRPE on CentOS 6.X

   1 cd /tmp
   2 wget dl.fedoraproject.org/pub/epel/6/i386/epel-release-6-8.noarch.rpm
   3 rpm -ivh epel-release-6-8.noarch.rpm
   4 yum install -y nrpe nagios-plugins-all openssl
   5 cp /etc/nagios/nrpe.cfg /etc/nagios/nrpe.cfg.back
   6 # Edit /etc/nagios/nrpe.cfg and add to allowed_hosts the IP address of the Nagios server
   7 service nrpe status
   8 service nrpe start
   9 chkconfig nrpe on
  10 cat /etc/xinetd.d/nrpe # check if exists to see where the config is
  11 

Sample service config

define service{
    use generic-service
    service_description Secure shell
    check_command check_ssh
}

define service{
    use generic-service
    service_description Web server
    check_command check_http
}

define service{
    use generic-service
    service_description RDP Windows
    check_command check_tcp!3389
}

Check command

   1 /usr/local/nagios/libexec/check_nrpe -H 192.168.1.1 -c check_metric

Check http with hostname, url and content

Plugin description: https://www.nagios-plugins.org/doc/man/check_http.html. Search for commands.cfg and define a new command that takes the hostname, the string expected in the page content, and the URL path as arguments.

define command{
  command_name    check_http_content
  command_line    $USER1$/check_http  -w 5 -c 10 -H $ARG1$ -s "$ARG2$" -u $ARG3$
}

define service{
  use generic-service
  host_name SERVERXYZ
  service_description HTTP www.example.net/urlx/
  check_command check_http_content!www.example.net!strInHtml!/urlx/
}

./check_http -w 5 -c 10 -H www.sapo.pt  -s "teste" -u /
HTTP OK: HTTP/1.1 200 OK - 391565 bytes in 0.376 second response time |time=0.376469s;5.000000;10.000000;0.000000 size=391565B;;;0

./check_http -w 5 -c 10 -H www.sapo.pt  -s "testesss" -u /
HTTP CRITICAL: HTTP/1.1 200 OK - string not found - 391720 bytes in 0.386 second response time |time=0.385639s;5.000000;10.000000;0.000000 size=391720B;;;0

Sample /etc/nagios/nrpe.cfg config file on CentOS 6.3 64 bit, host checked by Nagios

log_facility=daemon
pid_file=/var/run/nrpe/nrpe.pid
server_port=5666
nrpe_user=nrpe
nrpe_group=nrpe
#nagios server 192.168.1.2
allowed_hosts=192.168.1.2,127.0.0.1
dont_blame_nrpe=0
debug=1
command_timeout=60
connection_timeout=300
include_dir=/etc/nrpe.d/

command[check_users]=/usr/lib64/nagios/plugins/check_users -w 5 -c 10
command[check_load]=/usr/lib64/nagios/plugins/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /dev/hda1
command[check_zombie_procs]=/usr/lib64/nagios/plugins/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/lib64/nagios/plugins/check_procs -w 150 -c 200 

check_oracle_health shows warning when OK

http://forums.meulie.net/viewtopic.php?f=62&t=6282

Nagios shows Warning when the check is OK

Run the command locally on the target server as the nrpe user. Give that user permission to log in first, so that you can switch to it.

   1 su nrpe
   2 mycommand.py
   3 echo $?

If there are permission problems or similar errors when running as the nrpe user, solve them by testing the commands as that same user.

If the command creates or uses files and it was previously tested as root, delete those files so that they are recreated with the right permissions when the command runs as the nrpe user.

NRPE: Unable to read output (CentOS 6.3)

Make sure the following is set (it puts SELinux in permissive mode now and disables it on the next boot):

   1 setenforce 0
   2 nano /etc/sysconfig/selinux # change to SELINUX=disabled
   3 

Change number processes on check_total_procs command

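If we are getting too many warning messages regarding the total number of processes, raise the -w (warning) and -c (critical) thresholds of the check_total_procs command in /etc/nagios/nrpe.cfg on the monitored host, then restart the nrpe service.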

Sample plugin in Python

Deploy plugin in Ubuntu NRPE client

   1 #!/usr/bin/python
   2 import sys
   3 import datetime
   4 
   5 nowx = datetime.datetime.now()
   6 filename= '/home/userx/app/%02d%02d%02d_x.log'%(nowx.year,nowx.month,nowx.day)
   7 filex=open(filename)
   8 
   9 acmes={}
  10 
  11 for line in filex:
  12     if 'ACME did' in line:
  13         splitted = line.split(' ')
  14         acmeName=splitted[2]
  15         if acmeName in acmes:
  16             acmes[acmeName]=int(acmes[acmeName])+1
  17         else:
  18             acmes[acmeName]=1
  19 
  20 filex.close()
  21 
  22 print 'OK - Nr acme: %d|nrAcme=%d;;;;'%( len(acmes) , len(acmes) )
  23 res=0 # OK-0 WARNING-1 CRITICAL-2 UNKNOWN-3
  24 sys.exit(res) 

Nagios plugin configuration on nagios server

Service definition:

define service{
 use generic-service
 host_name ACMESRV
 service_description Acmes
 check_command check_nrpe!check_acme
 contacts abc.xyz.responsible
}

Edit map file for Nagiosgraph

# Number Acme
/perfdata:.*nrAcme=([\d]+);;;;/
and push @s, ['Number Acme',
               ['NrAcme', GAUGE, $1 ]
             ];

Check TCP port plugin

   1 #!/usr/bin/env python
   2 import socket
   3 import sys
   4 import time
   5 
   6 host = sys.argv[1] 
   7 port = int(sys.argv[2])
   8 
   9 retValue=0 # OK
  10 start = time.mktime(time.gmtime())
  11 timeout=5
  12 delta=timeout
  13 try:
  14     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  15     s.settimeout(timeout)
  16     s.connect((host, port))
  17     s.close()
  18     end =  time.mktime(time.gmtime())
  19     delta=end-start
  20 except Exception,ex:
  21     retValue=2 # CRITICAL
  22     delta=timeout
  23 
  24 if retValue==0:
  25     print 'OK - Connection time:%d|connTime=%d;;;;'%(delta,delta)
  26 if retValue==2:
  27     print 'ERROR - Connection time:%d|connTime=%d;;;;'%(delta,delta)
  28 
  29 sys.exit(retValue)

Nagios (last edited 2023-05-29 10:27:23 by 127)