#! /usr/bin/env python
#
# $Id: cron.py 6813 2009-07-07 18:54:12Z robin $
#
# Tasks which are to be done on a regular basis from cron.

import os
import sys

import util
import config
import execute
import control
import time
import shutil

# Triggers all activity which is to be done regularly via cron.
#
# Does nothing if the "cronenabled" option is "0" or if util.lock() reports
# failure. Output produced while the tasks run is buffered and, if there is
# any, mailed with its first line appended to the subject.
def doCron():

    if config.Config.cronenabled == "0":
        return

    config.Config.config["cron"] = "1"  # Flag to indicate that we're running from cron.

    if not util.lock():
        return

    try:
        util.bufferOutput()

        # Check whether nodes are still running and restart if necessary.
        for (node, isrunning) in control.isRunning(config.Config.nodes()):
            if not isrunning and node.hasCrashed():
                control.start([node])

        # Check for dead hosts.
        _checkHosts()

        # Generate statistics.
        _logStats(5)

        # Check available disk space.
        _checkDiskSpace()

        # Expire old log files.
        _expireLogs()

        # Update the HTTP stats directory.
        _updateHTTPStats()

        # Run external command if we have one.
        if config.Config.croncmd:
            execute.runLocalCmd(config.Config.croncmd)

        # Mail potential output.
        output = util.getBufferedOutput()
        if output:
            util.sendMail("cron: " + output.split("\n")[0], output)

    finally:
        # Release the lock and clear the cron flag even if one of the tasks
        # above raised; otherwise the lock would stay held after a failure.
        util.unlock()
        config.Config.config["cron"] = "0"

# Appends an "action" record to the stats log.
#
# node: node the action refers to; its tag attribute is written to the record.
# action: value written as the last field of the record.
def logAction(node, action):
    t = time.time()
    out = open(config.Config.statslog, "a")
    try:
        print >>out, t, node.tag, "action", action
    finally:
        out.close()

# Collects top, capstats and cflow statistics and appends them to the stats
# log, one space-separated record per line starting with the current time.
#
# interval: passed to control.getCapstatsOutput() and
#           control.calculateCFlowRate(); also the number of seconds slept
#           between the two cflow samples when capstats is not configured.
def _logStats(interval):

    nodes = config.Config.nodes()
    top = control.getTopOutput(nodes)

    have_cflow = config.Config.cflowaddress and config.Config.cflowuser and config.Config.cflowpassword
    have_capstats = config.Config.capstats
    cflow_start = cflow_end = None
    capstats = []
    cflow_rates = []

    if have_cflow:
        cflow_start = control.getCFlowStatus()

    if have_capstats:
        capstats = control.getCapstatsOutput(nodes, interval)

    elif have_cflow:
        # No capstats run separates the two cflow samples, so wait out the
        # interval ourselves (presumably getCapstatsOutput() takes that long
        # when it does run -- TODO confirm).
        time.sleep(interval)

    if have_cflow:
        cflow_end = control.getCFlowStatus()
        if cflow_start and cflow_end:
            cflow_rates = control.calculateCFlowRate(cflow_start, cflow_end, interval)

    t = time.time()

    out = open(config.Config.statslog, "a")
    try:
        # Per-process values from top; the "proc" entry names the record type.
        for (node, error, vals) in top:
            if error:
                print >>out, t, node.tag, "error", "error", error
                continue

            for proc in vals:
                proc_type = proc["proc"]
                for (key, val) in proc.items():
                    if key != "proc":
                        print >>out, t, node.tag, proc_type, key, val

        # Per-interface values from capstats.
        for (node, error, vals) in capstats:
            if error:
                print >>out, t, node.tag, "error", "error", error
                continue

            # State key remembering the packet count seen on the previous run.
            tag = "lastpkts-%s" % node.tag

            for (key, val) in vals.items():
                if key == "pkts":
                    # Report if we don't see packets on an interface, but only
                    # on the transition so that we don't repeat it every run.
                    if tag in config.Config.state:
                        last = float(config.Config.state[tag])
                    else:
                        last = -1.0  # No previous value recorded.

                    pkts = float(val)

                    if pkts == 0.0 and last != 0.0:
                        util.output("%s is not seeing any packets on interface %s" % (node.host, node.interface))

                    if pkts != 0.0 and last == 0.0:
                        util.output("%s is seeing packets again on interface %s" % (node.host, node.interface))

                    config.Config._setState(tag, val)

                print >>out, t, node.tag, "interface", key, val

        # Per-port cflow rates; entries with an error are silently skipped.
        for (port, error, vals) in cflow_rates:
            if not error:
                for (key, val) in vals.items():
                    print >>out, t, "cflow", port.lower(), key, val

    finally:
        out.close()

# Reports file systems whose usage exceeds (100 - mindiskspace) percent.
#
# The last seen usage is kept in the state under "disk-space-<node><fs>" so
# that a full file system is reported once rather than on every run. The
# check is disabled when mindiskspace is 0.
def _checkDiskSpace():

    minspace = float(config.Config.mindiskspace)
    if minspace == 0.0:
        return

    threshold = 100 - minspace

    for (node, dfs) in control.getDf(config.Config.nodes()).items():
        for df in dfs:
            # Entries are read as df[0] = file system, df[2] = used,
            # df[3] = available; df[1] is not needed here.
            fs = df[0]
            used = float(df[2])
            avail = float(df[3])

            if used + avail == 0.0:
                # Zero-sized file system: there is no meaningful usage
                # percentage, and dividing would abort the whole cron run.
                continue

            perc = used * 100.0 / (used + avail)
            key = "disk-space-%s%s" % (node, fs.replace("/", "-"))

            if perc > threshold:
                try:
                    if float(config.Config.state[key]) > threshold:
                        # Already reported.
                        continue
                except KeyError:
                    pass

                util.output("Disk space low on %s:%s - %.1f%% used." % (node, fs, perc))

            config.Config.state[key] = "%.1f" % perc

# Runs the expire-logs script unless logexpireinterval is zero, and reports
# the script's output if it fails.
def _expireLogs():

    if not int(config.Config.logexpireinterval):
        return

    (success, output) = execute.runLocalCmd(os.path.join(config.Config.scriptsdir, "expire-logs"))

    if not success:
        util.output("error running expire-logs\n\n")
        util.output(output)

# Checks which hosts are alive and reports every host whose status differs
# from the one recorded in the state under "alive-<host>". A host without a
# recorded status is recorded but not reported.
def _checkHosts():

    for node in config.Config.hosts():
        tag = "alive-%s" % node.host

        if execute.isAlive(node.addr):
            alive = "1"
        else:
            alive = "0"

        if tag in config.Config.state:
            previous = config.Config.state[tag]

            if alive != previous:
                if alive == "1":
                    status = "up"
                else:
                    status = "down"
                util.output("host %s %s" % (node.host, status))

        config.Config._setState(tag, alive)

# Runs the get-prof-log script for all hosts in parallel and reports the
# nodes for which it fails.
def _getProfLogs():

    script = os.path.join(config.Config.scriptsdir, "get-prof-log")

    cmds = []
    for node in config.Config.hosts():
        cmd = script + " %s %s %s/prof.log" % (node.tag, node.host, node.cwd())
        cmds.append((node, cmd, [], None))

    for (node, success, output) in execute.runLocalCmdsParallel(cmds):
        if not success:
            util.output("cannot get prof.log from %s" % node.tag)

# Returns element [1][0] of execute.captureCmd(cmd) (presumably the first
# line of the command's output -- TODO confirm), or an empty string if that
# element does not exist.
def _firstOutputLine(cmd):
    try:
        return execute.captureCmd(cmd)[1][0]
    except IndexError:
        return ""

# Refreshes the HTTP stats directory: fetches the prof.logs, rewrites
# meta.dat in the stats directory, and runs the update-stats script.
def _updateHTTPStats():

    # Get the prof.logs.
    _getProfLogs()

    # Create meta file.
    meta = open(os.path.join(config.Config.statsdir, "meta.dat"), "w")
    try:
        for node in config.Config.hosts():
            print >>meta, "node", node.tag, node.type, node.host

        print >>meta, "time", time.asctime()
        print >>meta, "version", config.Config.version

        # The values are resolved inside the helper because the print
        # statement writes its items one by one: an IndexError raised while
        # evaluating a later item would leave the field name already written.
        # With an empty value these lines come out as "os " and "host ".
        print >>meta, "os", _firstOutputLine("uname -a")
        print >>meta, "host", _firstOutputLine("hostname")

    finally:
        meta.close()

    # Run the update-stats script.
    (success, output) = execute.runLocalCmd(os.path.join(config.Config.scriptsdir, "update-stats"))

    if not success:
        util.output("error running update-stats\n\n")
        util.output(output)