Detect missed executions with OpenNMS

Everyone knows that OpenNMS is a powerful monitoring solution, but not everyone knows that since around version 1.10 it has embedded the Drools rule processing engine. Drools programs can therefore be used to extend the event handling logic in new and powerful ways.

The following example shows how OpenNMS can be extended to detect missed executions for recurring activities like backups or scheduled jobs.

The core functionality is implemented in the following Drools program (commented below):

package org.opennms.netmgt.correlation.drools;
import java.util.Date;
import java.util.Calendar;
import org.opennms.netmgt.correlation.drools.DroolsCorrelationEngine;
import org.opennms.netmgt.xml.event.Event;
import org.opennms.netmgt.xml.event.Parms;
import org.opennms.netmgt.xml.event.Parm;
import org.opennms.netmgt.xml.event.Value;
import org.opennms.netmgt.model.events.EventBuilder;
global org.opennms.netmgt.correlation.drools.DroolsCorrelationEngine engine;
global org.opennms.netmgt.correlation.drools.NodeService nodeService;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
global java.lang.Integer REPEATING_INTERVAL; // in hours
// Fact that tracks one recurring execution (job or backup set) on a node,
// together with the id of the timer that is currently armed for it.
declare Execution
// node id copied from the event that started the tracking
nodeid : Long
// UEI of the event that started the tracking
uei : String
// job / backup set / tag name as returned by getTag() for that event
tag : String
// raw content of the event's "interval" parameter (kept as a string, not parsed)
interval : String
// id returned by engine.setTimer() for the currently armed expiration timer
expireTimerId : Integer
// for passive status events
// (only filled in when the UEI contains "passiveServiceStatus")
passiveIpAddr: String
passiveServiceName: String
passiveNodeLabel: String
end
/*
 * Initial execution event for a node: start tracking the execution.
 *
 * Matches every Event whose "every" parameter has the content "every", creates an
 * Execution fact from it, arms the expiration timer and retracts the event.
 * Events without an "every" parameter (or with an empty one) are ignored instead of
 * raising an exception while the condition is evaluated.
 *
 * NOTE(review): this rule has no guard against an Execution that already exists for
 * the same node/tag; in that case both this rule and "subsequent event completed"
 * match the event and the outcome depends on which one the engine fires first - confirm.
 */
rule "initial event received"
    when
        $e : Event( $uei : uei, $nodeid : nodeid )
        // filter out internally generated 'missed' events (null-safe)
        eval( $e.getParm("every") != null
              && $e.getParm("every").getValue() != null
              && "every".equals($e.getParm("every").getValue().getContent()) )
    then
        // resolve the tag once; it is needed for the fact and for the log line
        String tag = getTag($e);

        Execution execution = new Execution();
        execution.setNodeid( $nodeid );
        execution.setUei( $uei );
        execution.setTag( tag );
        // NOTE(review): still assumes the event carries an "interval" parameter
        execution.setInterval( $e.getParm("interval").getValue().getContent() );
        execution.setExpireTimerId( engine.setTimer( getInterval($e, REPEATING_INTERVAL) ) );
        // handle passive status events
        if($uei.indexOf("passiveServiceStatus")!=-1) {
            execution.setPassiveIpAddr($e.getParm("passiveIpAddr").getValue().getContent());
            execution.setPassiveServiceName($e.getParm("passiveServiceName").getValue().getContent());
            execution.setPassiveNodeLabel($e.getParm("passiveNodeLabel").getValue().getContent());
        }
        insert( execution );
        // the event must be retracted here, otherwise the "subsequent event completed" rule will fire for it
        retract( $e );
        println( "Initial event tag="+tag+" event " + $uei + " for node " + $nodeid );
end
/*
 * Subsequent event completed.
 *
 * Matches an Event for a node that already has an Execution with the same tag:
 * the event is retracted, the running expiration timer is cancelled and a new one
 * is armed from the event's interval.
 * Both conditions are null-safe: an Execution whose tag is null only matches events
 * whose tag is null as well, and events without an "every" parameter are ignored.
 */
rule "subsequent event completed"
    when
        $e : Event( $uei : uei, $nodeid : nodeid )
        $execution : Execution( nodeid == $nodeid, $expireTimerId : expireTimerId )
        // same tag (getTag() may return null when the event carries no job/backupset/tag parameter)
        eval( $execution.getTag() == null ? getTag($e) == null : $execution.getTag().equals( getTag($e) ) )
        // filter out internally generated 'missed' events (null-safe)
        eval( $e.getParm("every") != null
              && $e.getParm("every").getValue() != null
              && "every".equals($e.getParm("every").getValue().getContent()) )
    then
        retract( $e );
        // replace the pending timer with a fresh one computed from this event
        engine.cancelTimer($expireTimerId);
        $execution.setExpireTimerId( engine.setTimer( getInterval($e, REPEATING_INTERVAL) ) );
        update( $execution );
        println( "Subsequent execution event " + $uei + " for node " + $nodeid +" supressed." );
end
/*
* Expiration timer expires: warn user that another execution event was not received in the expected interval.
*
* Matches an Execution together with the TimerExpired fact whose id equals the
* Execution's expireTimerId, sends the "missed" event and removes both facts.
* NOTE(review): TimerExpired is presumably inserted by the correlation engine when a
* timer created with engine.setTimer() fires - confirm.
*/
rule "timer expired"
when
$execution : Execution( $tag: tag, $nodeid : nodeid, $expireTimerId : expireTimerId, $uei : uei, $interval: interval, $passiveNodeLabel: passiveNodeLabel, $passiveIpAddr: passiveIpAddr, $passiveServiceName: passiveServiceName)
$expire : TimerExpired( id == $expireTimerId )
then
// emit the event flagged with every=missed
sendExecutionMissedEvent(engine, $nodeid, $uei, $tag , $interval, $passiveIpAddr, $passiveServiceName, $passiveNodeLabel);
// the execution is no longer tracked once it has been reported as missed
retract( $execution );
retract( $expire );
println( "Event execution expiration for " + $uei + " for node " + $nodeid +"["+$tag+"]." );
end
/*
 * Builds the "missed execution" event for a tracked execution and hands it to the engine.
 *
 * The new event reuses the original UEI with "Normal" replaced by "Warning", carries the
 * tag under "job" or "backupset" (depending on the UEI) as well as under "tag", and is
 * marked with every=missed. For passive status UEIs the passive* parameters are added too.
 */
function void sendExecutionMissedEvent( DroolsCorrelationEngine engine, Long nodeId, String uei, String tag, String interval, String passiveIpAddr, String passiveServiceName, String passiveNodeLabel) {
    // derive the UEI of the event to send from the UEI of the tracked one
    String missedUei = uei.replaceAll("Normal","Warning");
    EventBuilder missed = new EventBuilder(missedUei, "Drools");
    missed.setNodeid(nodeId.intValue());

    // correlation bookkeeping
    missed.addParam("correlationEngineName", "Drools");
    missed.addParam("correlationRuleSetName", engine.getName());
    missed.addParam("correlationComments", "RepeatingBackupRules");

    // the tag travels under the parameter name that matches the kind of UEI
    String tagParamName = (uei.indexOf("job") == -1) ? "backupset" : "job";
    missed.addParam(tagParamName, tag);

    boolean passiveStatus = uei.indexOf("passiveServiceStatus") != -1;
    if (passiveStatus) {
        missed.addParam("passiveStatus", "Down");
        missed.addParam("passiveIpAddr", passiveIpAddr);
        missed.addParam("passiveServiceName", passiveServiceName);
        missed.addParam("passiveNodeLabel", passiveNodeLabel);
        missed.addParam("passiveReasonCode", "no events received and timer expired");
    }

    missed.addParam("tag", tag);
    missed.addParam("interval", interval);
    // "missed" distinguishes missed executions from normal failures, which carry "every"
    missed.addParam("every", "missed");

    engine.sendEvent(missed.getEvent());
}
/*
 * Returns the name that identifies the execution an event belongs to.
 *
 * "backupset" and "job" are read in that order, so "job" wins when both are present;
 * "tag" is only used when neither of them produced a value. Returns null when the
 * event carries none of these parameters.
 */
function String getTag(Event e) {
    String found = null;

    // later names override earlier ones
    String[] names = new String[] { "backupset", "job" };
    for (int i = 0; i < names.length; i++) {
        Parm candidate = e.getParm(names[i]);
        if (candidate != null) {
            found = candidate.getValue().getContent();
        }
    }

    // generic fallback
    Parm generic = e.getParm("tag");
    if (found == null && generic != null) {
        found = generic.getValue().getContent();
    }
    return found;
}
/*
 * Computes the delay, in milliseconds, after which the next execution is considered missed.
 *
 * The event's "interval" parameter is either a plain number of hours ("24") or a number
 * of hours followed by a modifier ("36:weekdays"). When the parameter is missing or its
 * numeric part cannot be parsed, defaultInterval (in hours) is used instead.
 * Every path returns milliseconds, including the default, the parse-failure fallback and
 * intervals with an unrecognised modifier.
 *
 * e               - the event to read the "interval" parameter from
 * defaultInterval - fallback interval in hours
 * returns         - delay in milliseconds
 */
function long getInterval(Event e, Integer defaultInterval) {
    long msPerHour = 60L * 60L * 1000L;
    long hours = defaultInterval.intValue(); // default, in hours

    Parm p = e.getParm("interval");
    if (p == null) {
        return hours * msPerHour;
    }

    String value = p.getValue().getContent();
    if (value == null || value.indexOf(":") == -1) {
        // plain number of hours
        try {
            hours = Integer.parseInt(value);
        } catch (NumberFormatException exc) {
            println("Error parsing interval value="+value+" to integer. Using default REPEATING_INTERVAL");
        }
        return hours * msPerHour;
    }

    println("This event has an interval modifier: "+value);
    // value contains a modifier, parse it ("<hours>:<modifier>")
    String[] values = value.split(":", 2);
    try {
        hours = Integer.parseInt(values[0]);
    } catch (NumberFormatException exc) {
        println("Error parsing interval value="+value+" to integer. Using default REPEATING_INTERVAL");
    }

    // now find out which modifier was used
    if (!"weekdays".equals(values[1])) {
        // unknown modifier: treat the value as a plain number of hours
        return hours * msPerHour;
    }

    println("This event has a weekdays modifier");
    // the job is only meant to be run on weekdays: any execution whose timer would
    // expire on a weekend is moved to the following monday (same time of day as now)
    // plus the interval
    long now = System.currentTimeMillis();
    Calendar cal = Calendar.getInstance();
    cal.setTimeInMillis(now);
    cal.add(Calendar.HOUR_OF_DAY, (int) hours);
    int dow = cal.get(Calendar.DAY_OF_WEEK);
    // NOTE(review): FRIDAY is treated like a weekend day here although the comments and the
    // log message only mention sat/sun - kept as in the original, confirm whether intended
    if (dow == Calendar.SATURDAY || dow == Calendar.SUNDAY || dow == Calendar.FRIDAY) {
        println("Timer would expire on sat/sun, moving it up to monday");
        cal.setTimeInMillis(now);
        // add() carries into the next year; roll() would wrap back to January 1st of the
        // current year on December 31st and produce a negative delay
        do {
            cal.add(Calendar.DAY_OF_YEAR, 1);
        } while (cal.get(Calendar.DAY_OF_WEEK) != Calendar.MONDAY);
        // now add interval
        cal.add(Calendar.HOUR_OF_DAY, (int) hours);
    }
    println("Event timer has been moved up until: "+cal.getTime());
    long interval = cal.getTimeInMillis() - now;
    println("Interval in ms: "+interval);
    return interval;
}
/*
 * Logging helper: writes the message to standard error, prefixed with the
 * current date and the "RepeatingBackups" marker.
 */
function void println(Object msg) {
    String line = new Date() + " RepeatingBackups : " + msg;
    System.err.println(line);
}

First we need to define (at least) two UEIs: one for uei.<yournamespace>/job/recurring/Warning and one for uei.<yournamespace>/job/recurring/Normal. The events must be configured so that a Normal event clears any previous Warning. At the moment I feed these events into OpenNMS using syslog, but I am planning to replace syslog with my sendevent web-hook.
Each event carries three additional params (visible in the screenshot above):

  1. job or backupset : carries the job or backupset name, because one host can execute multiple jobs. It must be used in the event reduction key to achieve correct warnings resolution
  2. every : the value 'every' means it is an externally submitted event while 'missed' is used with events generated internally by expired Drools timers (missed executions). Every can be used as varbind filter to implement different notifications for 'regular' failures and missed execution failures
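  3. interval: positive integer value indicating the repeating interval in hours (24 for daily jobs, 1 for hourly jobs, and so on). The value may optionally be followed by a modifier, as in 36:weekdays, for jobs that only run on weekdays: a timer that would expire during the weekend is then moved to the following Monday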
Note that with this setup a successful execution will also clear any missed execution alarm.

As for the Drools program, the first relevant part is the definition of the Execution fact. An Execution carries the data necessary to identify the node and the job, plus the id of the timer that was set to the interval value of the event.

The first two rules define the handling of the initial and subsequent events, while the third handles the expiration of an interval. The code should be self-explanatory; ask in the comments if you need help.

Popular posts

Mirth: recover space when mirthdb grows out of control

1/4/2000 to 1/4/2025: the beginning

From 0 to ZFS replication in 5m with syncoid