BmnRoot
Loading...
Searching...
No Matches
mon-service.py
Go to the documentation of this file.
1#! /usr/bin/python3
2
3import os
4import traceback
5import requests
6import sys
7import argparse
8import smtplib
9import json
10from timeit import default_timer as timer
11from time import sleep, time
12# Install: "pip install psycopg2-binary"
13import psycopg2
14# Install: "pip install influxdb"
15from influxdb import InfluxDBClient
16from requests.exceptions import ConnectionError
17
18# InfluxDB measurement name
19MEASUREMENT_NAME = "resp_time"
20
# Number of consecutive failed checks tolerated before a DOWN notification email is sent
22allow_failed_check = 2
23
24def test_ping(ip):
25 # Number of pings is hardcoded, consider turning it into parameter if needed
26 resp = os.system("ping -c 5 {} > /dev/null".format(ip))
27 return {"success": resp == 0}
28
29
def test_pgsql_connection(params):
    """Connect to a PostgreSQL server, run a trivial query and time the attempt.

    Args:
        params: Mapping providing the "SERVER", "PORT", "USER", "PASS" and
            "DBNAME" entries used to open the connection.

    Returns:
        ``{"success": True, "latency": seconds}`` when connecting and running
        ``SELECT version();`` both work, otherwise ``{"success": False}``.
    """
    started = timer()
    conn = None
    cur = None

    try:
        # Note: timeout is hardcoded, consider turning it into parameter if needed
        conn = psycopg2.connect(
            host=params["SERVER"],
            port=params["PORT"],
            user=params["USER"],
            password=params["PASS"],
            database=params["DBNAME"],
            connect_timeout=5,
        )
        cur = conn.cursor()
        # A round trip to the server proves the connection is really usable.
        cur.execute("SELECT version();")
        cur.fetchone()
    except Exception:
        # Any failure (bad parameters, unreachable server, query error) counts
        # as an unsuccessful check; the finally clause still runs before return.
        return {"success": False}
    finally:
        # Release the cursor and the connection whether or not the check passed.
        if cur:
            cur.close()
        if conn:
            conn.close()

    # Latency is measured after cleanup, so it includes closing the connection.
    return {"success": True, "latency": timer() - started}
65
def test_http(http_addr):
    """Check that a web address answers with a 2xx HTTP status.

    Args:
        http_addr: URL requested with HTTP GET (5 second timeout).

    Returns:
        ``{"success": True}`` when the response status code is in 200-299,
        ``{"success": False}`` for any other status code or when the request
        raises a ``requests`` exception.
    """
    try:
        response = requests.get(http_addr, timeout=5)
    except requests.exceptions.RequestException:
        # ConnectionError, HTTPError, Timeout, TooManyRedirects
        return {"success": False}
    # The previous test used "< 200 and > 299", which can never be true, so
    # error statuses were reported as success; any non-2xx status now fails.
    return {"success": 200 <= response.status_code <= 299}
75
def send_email(to, short_msg, long_msg=""):
    """Send a plain-text notification email through the configured SMTP server.

    Reads the module globals ``config`` (its "MAIL" section: "SERVER", "PORT",
    "USER", "PASS") and ``email_subj_prefix``. Sending is best effort: any
    error is printed and swallowed so that a mail problem never stops the
    caller.

    Args:
        to: Recipient address, or several addresses separated by commas.
        short_msg: Text appended to the subject prefix to form the subject.
        long_msg: Optional message body.
    """
    try:
        mail_cfg = config["MAIL"]
        subj = email_subj_prefix + ": " + short_msg
        msg = "From: %s\nTo: %s\nSubject: %s\n\n%s\n" % (mail_cfg["USER"], to, subj, long_msg)

        # The context manager issues QUIT and closes the socket even when
        # starttls/login/sendmail raises, so a failed attempt leaks nothing.
        with smtplib.SMTP(mail_cfg["SERVER"], mail_cfg["PORT"]) as mailserver:
            mailserver.starttls()
            mailserver.login(mail_cfg["USER"], mail_cfg["PASS"])
            # For multiple recipients, 'to' has to be a list here
            mailserver.sendmail(mail_cfg["USER"], to.split(","), msg)
    except Exception as e:
        print("Error sending email - ignoring. Details: " + str(e))
89
90
def get_notify(params_dict):
    """Return the notification recipients for one configuration section.

    Args:
        params_dict: Configuration section of a single test or output.

    Returns:
        The section's own "NOTIFY" value when it has one, otherwise the
        "NOTIFY" value of the global ``config``.
    """
    # The global default is only looked up when the section has no override.
    return params_dict["NOTIFY"] if "NOTIFY" in params_dict else config["NOTIFY"]
97
98
def _record_check_result(failed_counts, test_name, success, test_params, label):
    """Update one test's consecutive-failure counter and email on state changes.

    A DOWN email is sent on the failure that finds the counter equal to
    ``allow_failed_check``; an UP email is sent on the first success after the
    counter has exceeded ``allow_failed_check`` (i.e. after DOWN was reported).

    Args:
        failed_counts: Dict mapping test name to its consecutive failure count.
        test_name: Key of the test in ``failed_counts``.
        success: Result of the latest check.
        test_params: Configuration section of the test, used to find recipients.
        label: Check type inserted into the email subject (e.g. "PING").
    """
    failures = failed_counts[test_name]
    if success:
        if failures == 0:
            # Was up and still is up: nothing to report or reset.
            return
        if failures > allow_failed_check:
            send_email(get_notify(test_params), test_name + " - " + label + " state changed to UP")
        failed_counts[test_name] = 0
    else:
        if failures == allow_failed_check:
            send_email(get_notify(test_params), test_name + " - " + label + " state changed to *** DOWN ***")
        failed_counts[test_name] = failures + 1


def main():
    """Load the JSON configuration and run the monitoring loop.

    The configuration file is given with ``--config``. Every cycle runs all
    configured PING, PGSQL and WEB checks, emails state changes to the
    recipients of the affected test, optionally writes PGSQL latency to
    InfluxDB, and then sleeps for ``INTERVAL_SEC`` seconds. An unexpected
    exception ends the loop after printing the traceback and emailing it to
    the ``LOG`` address.
    """
    # Config loaded from JSON file
    global config
    # Line to use in email Subject
    global email_subj_prefix

    argparser = argparse.ArgumentParser()
    argparser.add_argument("--config", help="Configuration file")
    args = argparser.parse_args()
    if args.config is None:
        print("You must provide configuration file using --config")
        sys.exit(1)
    print("Reading --config file: " + args.config)

    try:
        with open(args.config) as conf_file:
            config = json.load(conf_file)
    except Exception as e:
        print("Error reading config file: " + str(e))
        sys.exit(1)

    # Use either configured service name or JSON file name (without extension) for emails
    email_subj_prefix = config["NAME"] if "NAME" in config else os.path.splitext(os.path.basename(args.config))[0]

    print("Starting " + email_subj_prefix)
    send_email(config["LOG"], "Monitoring script started")

    # Consecutive failure counters, one entry per configured test
    ping_failed = {server: 0 for server in config["PING"]}
    pgsql_failed = {}
    for server, server_params in config["DATABASE"].items():
        if server_params["DBMS"] == "PGSQL":
            pgsql_failed[server] = 0
        else:
            print("Unsupported DBMS type: " + server_params["DBMS"])
    http_failed = {server: 0 for server in config["WEB"]}

    # InfluxDB output state
    influxdb_prev_success = True
    influxdb_initialized = False

    try:
        # The configuration does not change while running, so decide once
        # whether results are written to InfluxDB.
        influx_enabled = "OUTPUT" in config and config["OUTPUT"]["DBMS"].upper() == "INFLUXDB"
        if influx_enabled:
            output_cfg = config["OUTPUT"]
            if "USER" in output_cfg and "PASS" in output_cfg:
                iclient = InfluxDBClient(host=output_cfg["SERVER"], port=output_cfg["PORT"],
                                         username=output_cfg["USER"], password=output_cfg["PASS"])
            else:
                iclient = InfluxDBClient(host=output_cfg["SERVER"], port=output_cfg["PORT"])
        else:
            print("No configuration for InfluxDB output is found.")

        while True:
            # Do all PING tests
            for test_name, test_params in config["PING"].items():
                result_ping = test_ping(test_params["HOST"])
                print("PING " + test_name + ": " + str(result_ping))
                _record_check_result(ping_failed, test_name, result_ping["success"], test_params, "PING")

            # Do all PGSQL tests
            for test_name, test_params in config["DATABASE"].items():
                if test_params["DBMS"] != "PGSQL":
                    print("Unsupported DBMS type " + test_params["DBMS"])
                    continue
                result_pgsql = test_pgsql_connection(test_params)
                print("PGSQL " + test_name + ": " + str(result_pgsql))
                _record_check_result(pgsql_failed, test_name, result_pgsql["success"], test_params, "PGSQL")

                if influx_enabled:
                    # Line protocol: measurement,tag_set field_set timestamp
                    # A failed check has no latency and is stored as 0.
                    data = ["{measurement},test_name={test_name} resptime={resptime} {timestamp}"
                            .format(measurement=MEASUREMENT_NAME,
                                    test_name=test_name,
                                    resptime=result_pgsql.get("latency") or 0,
                                    timestamp=int(time() * 1000))]
                    try:
                        if not influxdb_initialized:
                            # If database exists, this does nothing
                            iclient.create_database(config["OUTPUT"]["DBNAME"])
                            influxdb_initialized = True
                        iclient.write_points(data, database=config["OUTPUT"]["DBNAME"], time_precision='ms',
                                             batch_size=10000, protocol='line')
                        if not influxdb_prev_success:
                            influxdb_prev_success = True
                            send_email(get_notify(config["OUTPUT"]), "InfluxDB is reachable again.")
                    except (ConnectionRefusedError, ConnectionError) as err:
                        print("Error writing to InfluxDB - ignoring...")
                        # Email only on the transition to unreachable, not on every failed write.
                        if influxdb_prev_success:
                            influxdb_prev_success = False
                            send_email(get_notify(config["OUTPUT"]), "InfluxDB is unreachable!", str(err))
                else:
                    print("Skip writing to InfluxDB due to no configuration.")

            # Do all HTTP tests
            for test_name, test_params in config["WEB"].items():
                result_http = test_http(test_params["HTTP"])
                print("Checking website for " + test_name + " (" + test_params["HTTP"] + "): " + str(result_http))
                # Recipients come from the web test itself (falling back to the
                # global NOTIFY), like the PING and PGSQL checks; looking them up
                # in config["OUTPUT"] raised KeyError when no OUTPUT was configured.
                _record_check_result(http_failed, test_name, result_http["success"], test_params, "Web Interface")

            sleep(config["INTERVAL_SEC"])

    except Exception:
        print("An exception has been occured with the following traceback info: ")
        print(traceback.format_exc())
        send_email(config["LOG"], "Monitoring script terminated!", traceback.format_exc())
233
234
235if __name__ == "__main__":
236 main()
int main()