Skip to content

Chapter 11.1: Service Level Indicators (SLI), Service Level Objectives (SLO), and Service Level Agreements (SLA)

Service Level concepts form the foundation of Site Reliability Engineering (SRE) practices. Together they let an organization measure service reliability (SLIs), set internal targets for it (SLOs), and make contractual commitments about it to customers (SLAs).

1. Service Level Indicators (SLI)

Service Level Indicators are quantitative measures of service performance and user experience.

Common SLI Types

yaml
# Prometheus metrics for SLI tracking
# Each entry pairs an SLI name with the PromQL expression that computes it;
# every expression uses a 5-minute rate window.
# NOTE(review): native Prometheus recording rules use `record:` (inside a rule
# group) rather than `name:` - confirm which tool consumes this format.
# Percentage of requests whose `status` label does not match 5xx.
# NOTE(review): with no traffic in the window the ratio is undefined
# (NaN or no data) - confirm how consumers treat that case.
- name: availability_sli
  expr: |
    (sum(rate(http_requests_total{status!~"5.."}[5m])) / 
     sum(rate(http_requests_total[5m]))) * 100

# 95th-percentile request duration estimated from histogram buckets.
# The metric name carries a `_seconds` suffix, so the result is presumably
# in seconds - verify against the exporter.
- name: latency_sli
  expr: |
    histogram_quantile(0.95, 
      sum(rate(http_request_duration_seconds_bucket[5m])) by (le))

# Requests per second across all series of http_requests_total.
- name: throughput_sli
  expr: |
    sum(rate(http_requests_total[5m]))

# Percentage of requests whose `status` label matches 5xx; the complement of
# availability_sli over the same window.
- name: error_rate_sli
  expr: |
    (sum(rate(http_requests_total{status=~"5.."}[5m])) / 
     sum(rate(http_requests_total[5m]))) * 100

SLI Implementation in Spring Boot

java
/**
 * Records per-request metrics in Micrometer so that availability, latency and
 * error-rate SLIs can be derived from them.
 *
 * <p>The request and error counters carry per-request tags. Micrometer fixes a
 * meter's tags when the meter is registered, so the tagged counters are
 * resolved through the registry for every event instead of being held in
 * fields; registering an already-known name/tag combination returns the
 * existing counter.
 *
 * <p>The {@code get*SLI()} accessors currently return placeholder values.
 */
@Component
@Slf4j
public class SLICollector {

    private static final String REQUESTS_METRIC = "http_requests_total";
    private static final String ERRORS_METRIC = "http_errors_total";
    private static final String DEFAULT_WINDOW = "5m";

    private final MeterRegistry meterRegistry;
    private final Timer requestDuration;
    // Kept in a field so the registered gauge stays reachable from this bean.
    private final Gauge activeConnections;

    /**
     * Registers the untagged meters (request timer and active-connection gauge).
     *
     * @param meterRegistry registry that receives every meter created by this collector
     */
    public SLICollector(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        this.requestDuration = Timer.builder("http_request_duration")
            .description("HTTP request duration")
            // Publish histogram buckets so quantiles can be computed on the
            // monitoring side (e.g. PromQL histogram_quantile over *_bucket).
            .publishPercentileHistogram()
            .register(meterRegistry);
        // The observed object and value function belong to the builder factory,
        // not to register().
        this.activeConnections = Gauge
            .builder("active_connections", this, SLICollector::getActiveConnections)
            .description("Active connections")
            .register(meterRegistry);
    }

    /**
     * Updates the request counter, the error counter (for status codes of 500
     * and above) and the duration timer for one completed HTTP request.
     *
     * @param event the completed request; its duration is recorded as milliseconds
     */
    @EventListener
    public void handleHttpRequest(HttpRequestEvent event) {
        // NOTE(review): "endpoint" should be a route template rather than a raw
        // path to keep tag cardinality bounded - confirm what getEndpoint() returns.
        Counter.builder(REQUESTS_METRIC)
            .description("Total HTTP requests")
            .tags(Tags.of(
                "method", event.getMethod(),
                "endpoint", event.getEndpoint(),
                "status", String.valueOf(event.getStatus())))
            .register(meterRegistry)
            .increment();

        if (event.getStatus() >= 500) {
            Counter.builder(ERRORS_METRIC)
                .description("Total HTTP errors")
                .tags(Tags.of(
                    "method", event.getMethod(),
                    "endpoint", event.getEndpoint()))
                .register(meterRegistry)
                .increment();
        }

        requestDuration.record(event.getDuration(), TimeUnit.MILLISECONDS);
    }

    /** @return the availability SLI for the default 5-minute window */
    public double getAvailabilitySLI() {
        return calculateAvailability(DEFAULT_WINDOW);
    }

    /** @return the 95th-percentile latency SLI for the default 5-minute window */
    public double getLatencySLI() {
        return calculateLatencyPercentile(95.0, DEFAULT_WINDOW);
    }

    /** @return the error-rate SLI for the default 5-minute window */
    public double getErrorRateSLI() {
        return calculateErrorRate(DEFAULT_WINDOW);
    }

    private double calculateAvailability(String timeWindow) {
        // Implementation for availability calculation
        return 99.95; // Placeholder
    }

    private double calculateLatencyPercentile(double percentile, String timeWindow) {
        // Implementation for latency percentile calculation
        return 150.0; // Placeholder
    }

    private double calculateErrorRate(String timeWindow) {
        // Implementation for error rate calculation
        return 0.1; // Placeholder
    }

    private double getActiveConnections() {
        // Implementation for active connections count
        return 0.0;
    }
}

SLI Dashboard Configuration

json
{
  "dashboard": {
    "id": "sli-dashboard",
    "title": "Service Level Indicators",
    "panels": [
      {
        "title": "Availability SLI",
        "type": "stat",
        "targets": [
          {
            "expr": "availability_sli",
            "legendFormat": "Availability %"
          }
        ],
        "thresholds": [
          { "color": "red", "value": 99.0 },
          { "color": "yellow", "value": 99.9 },
          { "color": "green", "value": 99.95 }
        ]
      },
      {
        "title": "Latency SLI (95th percentile)",
        "type": "graph",
        "targets": [
          {
            "expr": "latency_sli",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate SLI",
        "type": "graph",
        "targets": [
          {
            "expr": "error_rate_sli",
            "legendFormat": "Error Rate %"
          }
        ]
      }
    ]
  }
}

2. Service Level Objectives (SLO)

Service Level Objectives define target performance levels for services based on SLI measurements.

SLO Configuration

yaml
# SLO targets keyed by service class. Each objective names the SLI it is
# evaluated against (`sli`) and the window over which it is assessed
# (`timeWindow`).
# NOTE(review): the SLI expressions named here use 5-minute rate windows;
# confirm how values are aggregated up to the 30d/1h windows below.
slos:
  user_facing_service:
    # Availability target in percent.
    availability:
      target: 99.9
      timeWindow: "30d"
      sli: "availability_sli"
    
    # NOTE(review): the target is given in ms, but latency_sli is derived from
    # an http_request_duration_seconds histogram and so presumably reports
    # seconds - confirm a unit conversion happens before comparison.
    latency:
      target: 200
      timeWindow: "30d"
      sli: "latency_sli"
      unit: "ms"
    
    # Upper bound on the error rate, in percent.
    error_rate:
      target: 0.1
      timeWindow: "30d"
      sli: "error_rate_sli"
      unit: "%"

  background_service:
    # NOTE(review): background_availability_sli and background_throughput_sli
    # are not among the SLI definitions shown in this chapter - confirm they
    # are defined elsewhere.
    availability:
      target: 99.0
      timeWindow: "30d"
      sli: "background_availability_sli"
    
    # Throughput target in requests per second over a 1-hour window.
    throughput:
      target: 1000
      timeWindow: "1h"
      sli: "background_throughput_sli"
      unit: "rps"

SLO Monitoring Implementation

java
/**
 * Periodically compares current SLI values with the configured SLO targets,
 * logs the outcome of every check and raises an alert when the evaluated
 * status requires one.
 */
@Service
@Slf4j
public class SLOMonitoringService {

    private final SLICollector sliCollector;
    private final AlertManager alertManager;
    private final Map<String, SLOConfiguration> sloConfigs;

    /**
     * @param sliCollector   source of current SLI values
     * @param alertManager   receiver of SLO alerts
     * @param configProvider supplies the per-service SLO configurations, loaded once here
     */
    public SLOMonitoringService(SLICollector sliCollector,
                               AlertManager alertManager,
                               SLOConfigurationProvider configProvider) {
        this.sliCollector = sliCollector;
        this.alertManager = alertManager;
        this.sloConfigs = configProvider.loadConfigurations();
    }

    /** Checks every configured service; scheduled with a fixed delay of 60 seconds. */
    @Scheduled(fixedDelay = 60000) // Check every minute
    public void checkSLOCompliance() {
        sloConfigs.forEach(this::checkServiceSLO);
    }

    /** Evaluates each SLO of one service, alerting and logging per SLO. */
    private void checkServiceSLO(String service, SLOConfiguration config) {
        config.getSLOs().forEach(slo -> {
            double currentValue = getCurrentSLIValue(slo.getSliName());
            double errorBudget = calculateErrorBudget(slo, currentValue);

            SLOStatus status = evaluateSLOStatus(slo, currentValue, errorBudget);

            if (status.requiresAlert()) {
                alertManager.sendAlert(createSLOAlert(service, slo, status));
            }

            log.info("SLO Check - Service: {}, SLO: {}, Current: {}, Target: {}, Status: {}", 
                service, slo.getName(), currentValue, slo.getTarget(), status);
        });
    }

    /**
     * Dispatches to the error-budget calculation that matches the SLO type.
     *
     * @param slo          the objective being evaluated
     * @param currentValue the current value of the SLI backing the objective
     * @return the result of the type-specific calculation, or {@code 0.0} for
     *         any type other than availability, latency or error rate
     */
    public double calculateErrorBudget(SLO slo, double currentValue) {
        if (slo.getType() == SLOType.AVAILABILITY) {
            return calculateAvailabilityErrorBudget(slo, currentValue);
        }
        if (slo.getType() == SLOType.LATENCY) {
            return calculateLatencyErrorBudget(slo, currentValue);
        }
        if (slo.getType() == SLOType.ERROR_RATE) {
            return calculateErrorRateErrorBudget(slo, currentValue);
        }
        return 0.0;
    }

    /**
     * Computes the share of the availability error budget that is still
     * unspent, as a percentage clamped at zero.
     *
     * @param slo                 availability objective; its target is a percentage
     * @param currentAvailability measured availability as a percentage
     * @return remaining budget in percent; {@code 0.0} when the target leaves
     *         no budget at all (target of 100% or more)
     */
    private double calculateAvailabilityErrorBudget(SLO slo, double currentAvailability) {
        double allowedDowntime = (100.0 - slo.getTarget()) / 100.0;
        if (allowedDowntime <= 0.0) {
            // A 100% target has no budget to spend; dividing by it would
            // produce NaN or an infinity instead of a usable percentage.
            return 0.0;
        }
        double actualDowntime = (100.0 - currentAvailability) / 100.0;

        return Math.max(0, (allowedDowntime - actualDowntime) / allowedDowntime * 100);
    }

    /** Builds the alert payload for an SLO whose status requires alerting. */
    private SLOAlert createSLOAlert(String service, SLO slo, SLOStatus status) {
        String message = String.format("SLO %s for service %s is %s. Current: %.2f, Target: %.2f", 
            slo.getName(), service, status.getDescription(),
            status.getCurrentValue(), slo.getTarget());

        return SLOAlert.builder()
            .service(service)
            .sloName(slo.getName())
            .severity(determineSeverity(status))
            .message(message)
            .errorBudgetRemaining(status.getErrorBudgetRemaining())
            .timestamp(Instant.now())
            .build();
    }
}

Error Budget Calculation

java
/**
 * Builds per-service error budget reports covering a trailing time window.
 */
@Component
public class ErrorBudgetCalculator {

    /**
     * Calculates the error budget of every SLO configured for a service over
     * the window that ends now.
     *
     * @param service    name of the service whose SLO configuration is used
     * @param timeWindow length of the trailing window to evaluate
     * @return a report with one budget entry per SLO, keyed by SLO name
     * @throws IllegalArgumentException if an SLO has an unsupported type
     */
    public ErrorBudgetReport calculateErrorBudget(String service, Duration timeWindow) {
        LocalDateTime endTime = LocalDateTime.now();
        LocalDateTime startTime = endTime.minus(timeWindow);

        // Get SLO configuration for the service
        SLOConfiguration config = getSLOConfiguration(service);

        ErrorBudgetReport.Builder reportBuilder = ErrorBudgetReport.builder()
            .service(service)
            .timeWindow(timeWindow)
            .calculationTime(endTime);

        config.getSLOs().forEach(slo ->
            reportBuilder.addErrorBudget(slo.getName(), calculateSLOErrorBudget(slo, startTime, endTime)));

        return reportBuilder.build();
    }

    /** Selects the budget calculation that matches the SLO type. */
    private ErrorBudgetDetails calculateSLOErrorBudget(SLO slo,
                                                      LocalDateTime startTime,
                                                      LocalDateTime endTime) {
        return switch (slo.getType()) {
            case AVAILABILITY -> calculateAvailabilityErrorBudget(slo, startTime, endTime);
            case LATENCY -> calculateLatencyErrorBudget(slo, startTime, endTime);
            case ERROR_RATE -> calculateErrorRateErrorBudget(slo, startTime, endTime);
            default -> throw new IllegalArgumentException("Unsupported SLO type: " + slo.getType());
        };
    }

    /**
     * Computes the downtime budget for an availability SLO and how much of it
     * has been consumed between {@code startTime} and {@code endTime}.
     *
     * <p>The allowed downtime is {@code (100 - target)%} of the window,
     * rounded to the nearest millisecond. A budget whose remainder is
     * negative is reported as exhausted.
     */
    private ErrorBudgetDetails calculateAvailabilityErrorBudget(SLO slo,
                                                               LocalDateTime startTime,
                                                               LocalDateTime endTime) {
        Duration totalDuration = Duration.between(startTime, endTime);

        // Round instead of truncating: 100.0 - 99.9 is 0.0999999... in binary
        // floating point, so a truncating cast of the scaled value yields 9
        // rather than 10 and silently shrinks the budget by 10%.
        double allowedFraction = (100.0 - slo.getTarget()) / 100.0;
        long allowedMillis = Math.max(0L, Math.round(totalDuration.toMillis() * allowedFraction));
        Duration allowedDowntime = Duration.ofMillis(allowedMillis);

        Duration actualDowntime = getActualDowntime(startTime, endTime);
        Duration remainingErrorBudget = allowedDowntime.minus(actualDowntime);

        double budgetUsedPercentage;
        if (allowedDowntime.isZero()) {
            // No budget at all (e.g. a 100% target): avoid dividing by zero and
            // report the budget as fully used as soon as any downtime exists.
            budgetUsedPercentage = actualDowntime.isZero() ? 0.0 : 100.0;
        } else {
            budgetUsedPercentage = actualDowntime.toMillis() * 100.0 / allowedDowntime.toMillis();
        }

        return ErrorBudgetDetails.builder()
            .sloName(slo.getName())
            .totalBudget(allowedDowntime)
            .usedBudget(actualDowntime)
            .remainingBudget(remainingErrorBudget)
            .budgetUsedPercentage(budgetUsedPercentage)
            .status(remainingErrorBudget.isNegative()
                ? ErrorBudgetStatus.EXHAUSTED
                : ErrorBudgetStatus.HEALTHY)
            .build();
    }
}

3. Service Level Agreements (SLA)

Service Level Agreements are contractual commitments to customers regarding service performance.

SLA Management System

java
/**
 * JPA entity for one customer's service level agreement, stored in the
 * {@code service_level_agreements} table.
 *
 * <p>An agreement belongs to a customer and a service, carries a tier level
 * and a list of embedded {@link SLACommitment} values, and has an effective
 * date plus an optional expiration date.
 */
@Entity
@Table(name = "service_level_agreements")
public class ServiceLevelAgreement {
    
    /** Primary key, generated with the database identity strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    
    /** Identifier of the customer the agreement is made with; required. */
    @Column(nullable = false)
    private String customerId;
    
    /** Name of the service the agreement covers; required. */
    @Column(nullable = false)
    private String serviceName;
    
    @Column(nullable = false)
    private String tierLevel; // Premium, Standard, Basic
    
    /** Commitments of this agreement, stored in the {@code sla_commitments} collection table. */
    @ElementCollection
    @CollectionTable(name = "sla_commitments")
    private List<SLACommitment> commitments;
    
    /** Date and time from which the agreement applies; required. */
    @Column(nullable = false)
    private LocalDateTime effectiveDate;
    
    /** End of the agreement; the column is not declared non-null, so it may be absent. */
    @Column
    private LocalDateTime expirationDate;
    
    /** Lifecycle status, persisted by enum constant name. */
    @Enumerated(EnumType.STRING)
    private SLAStatus status;
    
    // Constructors, getters, setters
}

/**
 * Embeddable value describing one measurable commitment within a service
 * level agreement: the metric, its target, and the consequences of a breach.
 */
@Embeddable
public class SLACommitment {
    
    @Column(nullable = false)
    private String metric; // availability, response_time, throughput
    
    /** Target value for the metric; required. */
    @Column(nullable = false)
    private Double target;
    
    /** Unit in which {@code target} is expressed; optional. */
    @Column
    private String unit;
    
    /** Window over which the metric is measured; optional. */
    @Column
    private String measurementWindow;
    
    // NOTE(review): if penalty or credit ever represent monetary amounts rather
    // than percentages, BigDecimal would avoid floating-point rounding - confirm.
    @Column
    private Double penalty; // Penalty percentage for SLA breach
    
    @Column
    private Double credit; // Service credit for SLA breach
    
    // Constructors, getters, setters
}

SLA Monitoring and Reporting

java
/**
 * Checks active SLAs against their commitments on a schedule, handles
 * breaches (notification, optional service credits, warning log) and sends
 * weekly reports to customers.
 */
@Service
@Slf4j
public class SLAMonitoringService {

    private final SLARepository slaRepository;
    private final SLICollector sliCollector;
    private final NotificationService notificationService;
    private final SLAReportGenerator reportGenerator;

    /**
     * Constructor injection for the collaborators. The fields are
     * {@code final}, so they must be assigned here for the class to compile.
     *
     * @param slaRepository       lookup of SLAs by status
     * @param sliCollector        source of SLI measurements
     * @param notificationService channel used to announce SLA breaches
     * @param reportGenerator     builder of the weekly customer reports
     */
    public SLAMonitoringService(SLARepository slaRepository,
                                SLICollector sliCollector,
                                NotificationService notificationService,
                                SLAReportGenerator reportGenerator) {
        this.slaRepository = slaRepository;
        this.sliCollector = sliCollector;
        this.notificationService = notificationService;
        this.reportGenerator = reportGenerator;
    }

    /** Evaluates every active SLA; triggered by cron at the start of each hour. */
    @Scheduled(cron = "0 0 * * * *") // Every hour
    public void monitorSLACompliance() {
        List<ServiceLevelAgreement> activeSLAs = slaRepository.findByStatus(SLAStatus.ACTIVE);

        activeSLAs.forEach(this::checkSLACompliance);
    }

    /** Evaluates each commitment of one SLA, handling breaches and storing every result. */
    private void checkSLACompliance(ServiceLevelAgreement sla) {
        sla.getCommitments().forEach(commitment -> {
            SLAComplianceResult result = evaluateCommitment(sla, commitment);

            if (!result.isCompliant()) {
                handleSLABreach(sla, commitment, result);
            }

            // Store compliance data for reporting
            storeSLAComplianceData(sla, commitment, result);
        });
    }

    /**
     * Reads the current value of the commitment's metric over its measurement
     * window and records whether the commitment is met.
     *
     * @return a result carrying the target, the measured value, the compliance
     *         flag and the time of measurement
     */
    private SLAComplianceResult evaluateCommitment(ServiceLevelAgreement sla,
                                                  SLACommitment commitment) {
        String serviceName = sla.getServiceName();
        String metricName = commitment.getMetric();

        // Get current metric value
        double currentValue = getCurrentMetricValue(serviceName, metricName,
                                                   commitment.getMeasurementWindow());

        boolean isCompliant = isCommitmentMet(commitment, currentValue);

        return SLAComplianceResult.builder()
            .slaId(sla.getId())
            .customerId(sla.getCustomerId())
            .serviceName(serviceName)
            .metric(metricName)
            .target(commitment.getTarget())
            .actualValue(currentValue)
            .compliant(isCompliant)
            .measurementTime(LocalDateTime.now())
            .build();
    }

    /**
     * Reacts to a breached commitment: notifies stakeholders, calculates
     * service credits when the commitment defines a credit, and logs a warning.
     */
    private void handleSLABreach(ServiceLevelAgreement sla,
                                SLACommitment commitment,
                                SLAComplianceResult result) {
        // Create SLA breach event
        SLABreachEvent event = SLABreachEvent.builder()
            .slaId(sla.getId())
            .customerId(sla.getCustomerId())
            .serviceName(sla.getServiceName())
            .metric(commitment.getMetric())
            .target(commitment.getTarget())
            .actualValue(result.getActualValue())
            .breachTime(LocalDateTime.now())
            .severity(calculateBreachSeverity(commitment, result))
            .build();

        // Notify stakeholders
        notificationService.notifySLABreach(event);

        // Calculate service credits if applicable
        if (commitment.getCredit() != null) {
            calculateServiceCredits(sla, commitment, result);
        }

        log.warn("SLA Breach detected - Customer: {}, Service: {}, Metric: {}, Target: {}, Actual: {}", 
            sla.getCustomerId(), sla.getServiceName(), commitment.getMetric(),
            commitment.getTarget(), result.getActualValue());
    }

    /** Generates and sends a weekly report per active SLA; triggered by cron at midnight on Mondays. */
    @Scheduled(cron = "0 0 0 * * MON") // Weekly reports
    public void generateWeeklySLAReports() {
        List<ServiceLevelAgreement> activeSLAs = slaRepository.findByStatus(SLAStatus.ACTIVE);

        activeSLAs.forEach(sla -> {
            SLAReport report = reportGenerator.generateWeeklyReport(sla);
            sendReportToCustomer(sla.getCustomerId(), report);
        });
    }
}

SLA Reporting Dashboard

yaml
# Grafana dashboard for SLA monitoring
# Kubernetes ConfigMap that carries the dashboard definition as an embedded
# JSON document under the `dashboard.json` key.
apiVersion: v1
kind: ConfigMap
metadata:
  name: sla-dashboard-config
data:
  # Panels: a compliance table (sla_compliance_status), a 24-hour breach count
  # (sla_breach_total) and service credits summed per `customer` label
  # (sla_service_credits_total).
  # NOTE(review): these three metrics are not emitted by the code shown in this
  # chapter - confirm the SLA monitoring service exports them.
  # Everything below the `|` is literal JSON; do not add YAML comments inside it.
  dashboard.json: |
    {
      "dashboard": {
        "title": "SLA Compliance Dashboard",
        "panels": [
          {
            "title": "SLA Compliance by Service",
            "type": "table",
            "targets": [
              {
                "expr": "sla_compliance_status",
                "format": "table"
              }
            ]
          },
          {
            "title": "SLA Breach Count",
            "type": "stat",
            "targets": [
              {
                "expr": "increase(sla_breach_total[24h])",
                "legendFormat": "24h Breaches"
              }
            ]
          },
          {
            "title": "Service Credits Issued",
            "type": "graph",
            "targets": [
              {
                "expr": "sum(sla_service_credits_total) by (customer)",
                "legendFormat": "Credits - {{customer}}"
              }
            ]
          }
        ]
      }
    }

4. Best Practices

4.1 SLI Selection Guidelines

yaml
# Suggested SLIs per service category. `primary_slis` and `secondary_slis`
# are separate lists for each category.
# NOTE(review): presumably primary SLIs are the ones that back SLOs and
# secondary SLIs are tracked for context only - confirm the intended meaning.
sli_selection_guidelines:
  # Services that answer user requests directly.
  user_facing_services:
    primary_slis:
      - availability
      - latency
      - error_rate
    secondary_slis:
      - throughput
      - data_quality
  
  # Batch or pipeline style services.
  data_processing_services:
    primary_slis:
      - data_freshness
      - processing_completeness
      - error_rate
    secondary_slis:
      - throughput
      - resource_utilization
  
  # Services whose main job is storing data.
  storage_services:
    primary_slis:
      - availability
      - durability
      - consistency
    secondary_slis:
      - latency
      - throughput

4.2 SLO Setting Strategy

java
/**
 * Suggests SLO targets from historical performance and service criticality.
 */
@Component
public class SLOOptimizer {

    /**
     * Recommends a target for one metric of a service.
     *
     * <p>The recommendation scales a historical percentile by a factor chosen
     * per criticality: 95th percentile times 0.95 for critical services,
     * 90th percentile times 0.98 for important ones and 90th percentile
     * times 1.02 for standard ones.
     *
     * <p>NOTE(review): whether a smaller factor is "more aggressive" depends on
     * the metric's direction (lower-is-better for latency, higher-is-better
     * for availability) - confirm the factors suit every metric passed in.
     *
     * @param service        service to analyse
     * @param metric         metric the target is recommended for
     * @param analysisWindow span of history to analyse
     * @return the recommended target together with a confidence value and reasoning
     */
    public SLORecommendation recommendSLO(String service, String metric,
                                         Duration analysisWindow) {
        // Analyze historical performance
        HistoricalPerformance historical = analyzeHistoricalPerformance(
            service, metric, analysisWindow);

        // Only the percentiles that feed a recommendation are read.
        double p95Performance = historical.getPercentile(95.0);
        double p90Performance = historical.getPercentile(90.0);

        // Recommend target based on business criticality
        ServiceCriticality criticality = getServiceCriticality(service);

        double recommendedTarget = switch (criticality) {
            case CRITICAL -> p95Performance * 0.95; // More aggressive
            case IMPORTANT -> p90Performance * 0.98; // Balanced
            case STANDARD -> p90Performance * 1.02; // Conservative
        };

        return SLORecommendation.builder()
            .service(service)
            .metric(metric)
            .recommendedTarget(recommendedTarget)
            .confidence(calculateConfidence(historical))
            .reasoning(generateReasoning(historical, criticality))
            .build();
    }
}

4.3 Error Budget Policies

yaml
# Actions to take depending on how quickly the error budget is being spent.
# Burn rate = (fraction of budget consumed) * (SLO window / detection window).
# The burn rates quoted below assume a 30-day SLO window - adjust if the
# window differs.
error_budget_policies:
  # 2% of a 30-day budget in 1 hour is a burn rate of 14.4 (0.02 * 720h / 1h).
  fast_burn:
    detection_window: "1h"
    threshold: "2%" # 2% error budget consumed in 1 hour
    actions:
      - alert_on_call
      - stop_deployments
      # NOTE(review): presumably means using feature flags to switch off risky
      # features - confirm the intended action.
      - enable_feature_flags
  
  # 5% of a 30-day budget in 6 hours is a burn rate of 6 (0.05 * 720h / 6h).
  slow_burn:
    detection_window: "6h" 
    threshold: "5%" # 5% error budget consumed in 6 hours
    actions:
      - alert_team
      - schedule_review
  
  # Applies once the whole budget is spent; no detection window is defined.
  budget_exhausted:
    threshold: "100%"
    actions:
      - stop_all_deployments
      - activate_incident_response
      - notify_leadership

This comprehensive guide provides the foundation for implementing effective SLI/SLO/SLA practices in your organization. The next sections will cover Incident Management, Chaos Engineering, and Capacity Planning to complete your SRE implementation.

Created by Eren Demir.