Advertisement
dachte

compumancer

Feb 11th, 2021
857
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.96 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import argparse
  4. import code # for code.interact(local=dict(globals, **locals()) ) debugging
  5. import datetime
  6. import json
  7. import os
  8. import requests
  9. import psutil
  10. import sys
  11. import time
  12. from typing import Dict, List, Tuple
  13.  
  14. ######################
  15. # compumancer
  16. #
  17. # A very small tool meant to be run under cron, so it will email somebody
  18. # if something is up. This will return a nonzero response code if any defined
  19. # tests are not entirely okay. Meant to be run with chronic or some other wrapper.
  20. #
  21. # This is meant to be useful if you need cron-based monitoring because you're working somewhere where
  22. # nagios is impractical. This is not meant as a replacement for good monitoring, just a stopgap.
  23. #
  24. # This uses conda, because Ubuntu's python doesn't come with psutil.
  25.  
  26. def main():
  27.     cfg, jconf = handle_args()
  28.     anger = False
  29.     all_msgs = []
  30.  
  31.     fail, msgs = trial_of_space(jconf, cfg.fatal_only)
  32.     if fail:
  33.         anger = True
  34.         all_msgs += msgs
  35.  
  36.     fail, msgs = trial_of_attention(jconf, cfg.fatal_only)
  37.     if fail:
  38.         anger = True
  39.         all_msgs += msgs
  40.  
  41.     fail, msgs = trial_of_whispers(jconf, cfg.fatal_only)
  42.     if fail:
  43.         anger = True
  44.         all_msgs += msgs
  45.  
  46.     fail, msgs = trial_of_calligraphy(jconf, cfg.fatal_only)
  47.     if fail:
  48.         anger = True
  49.         all_msgs += msgs
  50.  
  51.     fail, msgs = trial_of_the_burro(jconf, cfg.fatal_only)
  52.     if fail:
  53.         anger = True
  54.         all_msgs += msgs
  55.  
  56.     fail, msgs = trial_of_loneliness(jconf, cfg.fatal_only)
  57.     if fail:
  58.         anger = True
  59.         all_msgs += msgs
  60.     if anger:
  61.         print("Compumancer is angry. Reasons:")
  62.         print("\n".join(map(lambda x: "\t" + x, all_msgs)))
  63.         sys.exit(1)
  64.     else:
  65.         print("Compumancer is happy")
  66.  
  67. def handle_args():
  68.     # Unlike my normal handle_args(), this loads a json configuration from a file.
  69.     parser = argparse.ArgumentParser(description="Mini monitoring")
  70.     parser.add_argument("--fatal_only", action="store_true", help="Be noisier")
  71.     ret = parser.parse_args()
  72.     config_fn = 'compumancer.cfg'
  73.     if not os.path.exists(config_fn):
  74.         raise Exception('JSON Config file {config_fn} does not exist!')
  75.     with open(config_fn, 'r') as jconf:
  76.         jret = json.load(jconf)
  77.     return ret, jret
  78.  
  79. ###########
  80. # trials
  81. #
  82. # Trials should take a dict and a boolean (even if they ignore either/both).
  83. # Trials should not take extra optional parameters, and must not require additional parameters.
  84. # They should not refer to globals for machine-specific configuration
  85. # Trials must return a tuple of a boolean (whether the trial failed, where grounds for failure *may* be loosened by the parameter)
  86. #   and a list of strings providing useful diagnostic information.
  87. # Trials *should not* write to stdout.
  88. # Trials *must* leave their list return value empty if their boolean is False.
  89. # Trials *should* be written with a sense of humour in their variable names.
  90.  
  91. def trial_of_space(jconf:Dict, fatal_only:bool=False) -> Tuple[bool, List[str]]:
  92.     # Is there enough free space on filesystems?
  93.     # Threshold for warnings is 80%
  94.     # Threshold for errors is 90%
  95.     # psutil.disk_usage('path') returns (total,used,free,percentage)
  96.     anger = False
  97.     if fatal_only:
  98.         thresh = 90
  99.     else:
  100.         thresh = 80
  101.  
  102.     ret = []
  103.     for fs in jconf['filesystems']:
  104.         if not os.path.exists(fs):
  105.             anger = True
  106.             ret.append(f"Filesystem {fs} does not exist")
  107.             continue
  108.         total, used, free, percent = psutil.disk_usage(fs)
  109.         if percent > thresh:
  110.             anger = True
  111.             ret.append(f"Filesystem {fs} is {percent} full, over {thresh} threshold")
  112.     return anger, ret
  113.  
  114. def trial_of_attention(jconf:Dict, fatal_only:bool=False) -> Tuple[bool, List[str]]:
  115.     # Is the system load acceptable?
  116.     # Threshold for warnings is 10
  117.     # Threshold for errors is 30
  118.     # use os.getloadavg()
  119.     anger = False
  120.     ret = []
  121.     if fatal_only:
  122.         thresh = 30
  123.     else:
  124.         thresh = 10
  125.  
  126.     loadavg = os.getloadavg()[1] # returns 3 figures. Picking the five-minute average just because
  127.     if loadavg > thresh:
  128.         anger = True
  129.         ret.append(f"Load average {loadavg} is over threshold {thresh}")
  130.     return anger, ret
  131.  
  132. def trial_of_whispers(jconf:Dict, fatal_only:bool=False) -> Tuple[bool, List[str]]:
  133.     # Visit certain URLs. Meant to verify that
  134.     # some web service is running (and, if SSL, that its certs work)
  135.     # use the requests library
  136.     anger = False
  137.     ret = []
  138.     for url in jconf['urls']:
  139.         req = requests.get(url)
  140.         status = req.status_code
  141.         if status != 200:
  142.             anger = True
  143.             r.append(f'Got a {status} when visiting {url}')
  144.     return anger, ret
  145.  
  146. def trial_of_calligraphy(jconf:Dict, fatal_only:bool=False) -> Tuple[bool, List[str]]:
  147.     # See if there's any mention of the oom killer in dmesg. Try to just look at recent events
  148.     # so rebooting isn't needed to recover. Note that we're a little bit lazy and only read one file
  149.     # to look for this.
  150.     anger = False
  151.     ret = []
  152.     if not os.path.isfile(jconf['logfile']):
  153.         anger = True
  154.         ret.append(f"Could not open logfile {jconf['logfile']}")
  155.         return anger, ret
  156.     with open(jconf['logfile']) as syslog:
  157.         for sline in syslog:
  158.             if 'killed process' in sline:
  159.                 mon,day,time = sline.split(' ', 3)
  160.                 # Next we make a good-faith effort to get the time the oom killer ran.
  161.                 # Note that because the file we're consulting lacks a year string, we assume that the logs are from the
  162.                 # current year, and to compensate for when it is not we filter out times in the future. This will produce
  163.                 # weirdness around the turning of the year, but hopefully survivable weirdness. There's nothing better that
  164.                 # we can do that immediately comes to mind.
  165.                 logzeit = int(time.mktime(time.strptime(f'{datetime.now().year} {mon} {day} {time}', '%Y %b %d %H:%M:%S')))
  166.                 if logzeit > time.time():
  167.                     continue # We could try to parse these by decrementing the year and giving it another go
  168.                 if logzeit - int(time.time()) > 60*60*3: # OOM killer in the last 3 hours
  169.                     continue
  170.                 anger = True
  171.                 ret.append(f'OOM killer at {time}')
  172.     return anger, ret
  173.  
  174. def trial_of_the_burro(jconf:Dict, fatal_only:bool=False) -> Tuple[bool, List[str]]:
  175.     # See if the system has an acceptable amount of free RAM.
  176.     # Use psutil.virtual_memory() for this.
  177.     anger = False
  178.     ret = []
  179.     if fatal_only:
  180.         thresh = 90
  181.     else:
  182.         thresh = 70
  183.     mem_used_percent = psutil.virtual_memory().percent
  184.     if mem_used_percent > thresh:
  185.         anger = True
  186.         ret.append(f'Memory usage at {mem_used_percent}, over threshold of {thresh}')
  187.     return anger, ret
  188.  
  189. def trial_of_loneliness(jconf:Dict, fatal_only:bool=False) -> Tuple[bool, List[str]]:
  190.     # See if processes necessary for this box are running.
  191.     # use psutil.process_iter() for this (in a loop)
  192.     anger = False
  193.     ret = []
  194.     seen = {}
  195.     for proc in psutil.process_iter(): # This iter, we just note all the process names active on the system
  196.         try:
  197.             info = proc.as_dict(attrs=['pid', 'name'])
  198.         except psutil.NoSuchProcess: # handle race conditions
  199.             pass
  200.         else:
  201.             seen[info['name']] = 1
  202.     for sought in jconf['processes']:
  203.         if sought not in seen:
  204.             anger = True
  205.             ret.append(f"Could not find process {sought}")
  206.     return anger, ret
  207.  
  208.  
  209. #####
  210. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement