Chaos Engineering
Chaos Engineering, sistemlerin dayanıklılığını test etmek ve güçlendirmek için kontrollü şekilde hatalar oluşturma disiplinidir. Bu yaklaşım, sistemlerin gerçek dünya koşullarındaki beklenmedik durumlara karşı nasıl davrandığını anlamaya yardımcı olur.
Chaos Engineering Temelleri
Chaos Engineering Prensipleri
1. Steady State Hypothesis
java
// Chaos Engineering Hypothesis Framework
/**
 * Builds steady-state hypotheses for a service and checks live metrics against them.
 *
 * <p>A hypothesis combines a baseline computed from historical metrics with fixed
 * per-metric tolerances. {@link #validateSteadyState} reports whether a set of current
 * metrics still satisfies the hypothesis.
 */
@Component
public class SteadyStateValidator {

    /** Tolerance values handed to the tolerance builder (0.2 corresponds to 20%). */
    private static final double LATENCY_TOLERANCE = 0.2;
    private static final double THROUGHPUT_TOLERANCE = 0.15;
    private static final double ERROR_RATE_TOLERANCE = 0.1;
    private static final double AVAILABILITY_TOLERANCE = 0.01;

    /** Duration stored on every hypothesis. */
    private static final Duration HYPOTHESIS_DURATION = Duration.ofMinutes(30);

    /** Look-back window used when computing the baseline. */
    private static final Duration BASELINE_WINDOW = Duration.ofDays(30);

    /**
     * Creates the steady-state hypothesis for the given service.
     *
     * @param serviceName service the hypothesis is built for
     * @return hypothesis carrying the service name, baseline, tolerances and a 30 minute duration
     */
    public SteadyStateHypothesis defineHypothesis(String serviceName) {
        ServiceBaseline serviceBaseline = defineBaseline(serviceName);
        ServiceTolerance serviceTolerance = defineTolerance(serviceName);

        return SteadyStateHypothesis.builder()
                .serviceName(serviceName)
                .baseline(serviceBaseline)
                .tolerances(serviceTolerance)
                .duration(HYPOTHESIS_DURATION)
                .build();
    }

    /**
     * Computes the baseline from the last 30 days of metrics of the service.
     */
    private ServiceBaseline defineBaseline(String serviceName) {
        List<ServiceMetrics> history =
                metricsService.getHistoricalMetrics(serviceName, BASELINE_WINDOW);

        return ServiceBaseline.builder()
                .averageLatency(calculateAverageLatency(history))
                .averageThroughput(calculateAverageThroughput(history))
                .errorRate(calculateAverageErrorRate(history))
                .availability(calculateAvailability(history))
                .build();
    }

    /**
     * Returns the fixed tolerance set; the service name is currently not consulted.
     */
    private ServiceTolerance defineTolerance(String serviceName) {
        return ServiceTolerance.builder()
                .latencyTolerance(LATENCY_TOLERANCE)
                .throughputTolerance(THROUGHPUT_TOLERANCE)
                .errorRateTolerance(ERROR_RATE_TOLERANCE)
                .availabilityTolerance(AVAILABILITY_TOLERANCE)
                .build();
    }

    /**
     * Checks latency, throughput and error rate (in that order) against the hypothesis.
     * Evaluation stops at the first metric that is out of tolerance.
     *
     * <p>NOTE(review): availability has both a baseline and a tolerance but is not
     * checked here - confirm whether that omission is intended.
     *
     * @param hypothesis     hypothesis supplying baseline and tolerance
     * @param currentMetrics metrics observed now
     * @return {@code true} when all three checked metrics pass {@code isWithinTolerance}
     */
    public boolean validateSteadyState(SteadyStateHypothesis hypothesis, ServiceMetrics currentMetrics) {
        ServiceBaseline expected = hypothesis.getBaseline();
        ServiceTolerance allowed = hypothesis.getTolerance();

        return isWithinTolerance(
                        currentMetrics.getLatency(), expected.getAverageLatency(), allowed.getLatencyTolerance())
                && isWithinTolerance(
                        currentMetrics.getThroughput(), expected.getAverageThroughput(), allowed.getThroughputTolerance())
                && isWithinTolerance(
                        currentMetrics.getErrorRate(), expected.getErrorRate(), allowed.getErrorRateTolerance());
    }
}
2. Experiment Design
java
// Chaos Experiment Framework
/**
 * Persistent record of one chaos experiment: what is expected to hold (hypothesis),
 * how the failure is injected (method), what it targets (scope), how long it runs,
 * its lifecycle status and the results collected so far.
 *
 * NOTE(review): other snippets in this document call builder(), addResult(...) and
 * getTargetService() on this type; none of them are declared here - presumably they
 * belong to the members elided below. Confirm before relying on them.
 */
@Entity
public class ChaosExperiment {
// Primary key of the entity.
@Id
private String id;
// Human-readable name and free-text description.
private String name;
private String description;
// Steady-state expectation the experiment is evaluated against.
private SteadyStateHypothesis hypothesis;
// Failure-injection method and the scope it is applied to.
private ExperimentMethod method;
private ExperimentScope scope;
// Duration of the experiment.
private Duration duration;
// Lifecycle state of the experiment.
private ExperimentStatus status;
// Results recorded for this experiment.
private List<ExperimentResult> results;
// Constructors, getters and setters are omitted from this listing.
}
/**
 * Creates chaos experiments and runs them end to end: pre-condition check, fault
 * injection, monitoring, hypothesis validation and cleanup.
 */
@Service
public class ChaosExperimentService {

    /**
     * Builds a new experiment in {@code PLANNED} state from the request and persists it.
     *
     * @param request name, description, target service, method and duration of the experiment
     * @return the value returned by the repository's {@code save}
     */
    public ChaosExperiment createExperiment(ExperimentRequest request) {
        ChaosExperiment experiment = ChaosExperiment.builder()
                .id(UUID.randomUUID().toString())
                .name(request.getName())
                .description(request.getDescription())
                .hypothesis(steadyStateValidator.defineHypothesis(request.getServiceName()))
                .method(request.getMethod())
                .scope(defineScope(request))
                .duration(request.getDuration())
                .status(ExperimentStatus.PLANNED)
                .build();
        return chaosExperimentRepository.save(experiment);
    }

    /**
     * Runs the experiment with the given id and returns its result.
     *
     * <p>Status transitions ({@code RUNNING}, then {@code COMPLETED} or {@code FAILED})
     * and the recorded result are written back through the repository; previously they
     * were only applied to the in-memory object. Cleanup of the injection always runs,
     * and the experiment is saved even if cleanup itself throws.
     *
     * @param experimentId id of a previously created experiment
     * @return the result of this execution
     * @throws ExperimentNotFoundException           if no experiment has this id
     * @throws ExperimentPreConditionFailedException if the pre-condition check fails
     * @throws ExperimentExecutionException          if injection, monitoring or validation throws
     */
    public ExperimentResult executeExperiment(String experimentId) {
        ChaosExperiment experiment = chaosExperimentRepository.findById(experimentId)
                .orElseThrow(() -> new ExperimentNotFoundException(experimentId));

        // Pre-experiment validation: nothing has been injected yet, so no cleanup is needed.
        if (!validatePreConditions(experiment)) {
            throw new ExperimentPreConditionFailedException("Pre-conditions not met");
        }

        experiment.setStatus(ExperimentStatus.RUNNING);
        // Persist RUNNING so the state is visible outside this call while the experiment runs.
        chaosExperimentRepository.save(experiment);

        try {
            // Execute chaos injection
            ChaosInjectionResult injectionResult = executeInjection(experiment);

            // Monitor system behavior
            List<ServiceMetrics> monitoringResults = monitorSystemBehavior(experiment);

            // Validate hypothesis
            boolean hypothesisValid = validateHypothesis(experiment, monitoringResults);

            // Create result
            ExperimentResult result = ExperimentResult.builder()
                    .experimentId(experimentId)
                    .injectionResult(injectionResult)
                    .monitoringResults(monitoringResults)
                    .hypothesisValid(hypothesisValid)
                    .executedAt(Instant.now())
                    .build();

            experiment.addResult(result);
            experiment.setStatus(ExperimentStatus.COMPLETED);
            return result;
        } catch (Exception e) {
            experiment.setStatus(ExperimentStatus.FAILED);
            throw new ExperimentExecutionException("Experiment failed", e);
        } finally {
            try {
                // Cleanup chaos injection
                cleanupInjection(experiment);
            } finally {
                // Persist the final status and any recorded result, even when cleanup fails.
                chaosExperimentRepository.save(experiment);
            }
        }
    }
}
Chaos Monkey Implementation
1. Service Termination
java
// Chaos Monkey Service Killer
/**
 * Chaos-Monkey-style job that periodically deletes one random pod of one random
 * eligible deployment in the configured target namespace.
 */
@Component
public class ServiceTerminationChaos {

    @Autowired
    private KubernetesClient kubernetesClient;

    @Autowired
    private ChaosConfigurationService configService;

    /**
     * Scheduled entry point. The cron expression fires at minute 0 and 30 of hours
     * 9 through 17, Monday to Friday. Does nothing when service termination is
     * disabled or no deployment is eligible.
     */
    @Scheduled(cron = "0 */30 9-17 * * MON-FRI")
    public void randomServiceTermination() {
        ChaosConfiguration config = configService.getChaosConfiguration();
        if (!config.isServiceTerminationEnabled()) {
            return;
        }

        List<String> eligibleServices = getEligibleServices(config);
        if (eligibleServices.isEmpty()) {
            log.info("No eligible services for chaos termination");
            return;
        }

        String targetService = selectRandomService(eligibleServices);
        // Terminate in the same namespace the candidates were listed from.
        terminateRandomInstance(config.getTargetNamespace(), targetService);
    }

    /**
     * Lists the names of all deployments in the target namespace that pass
     * {@link #isEligibleForChaos}.
     */
    private List<String> getEligibleServices(ChaosConfiguration config) {
        return kubernetesClient.apps().deployments()
                .inNamespace(config.getTargetNamespace())
                .list()
                .getItems()
                .stream()
                .filter(deployment -> isEligibleForChaos(deployment, config))
                .map(deployment -> deployment.getMetadata().getName())
                .collect(Collectors.toList());
    }

    /**
     * A deployment is eligible when it is labelled {@code chaos.enabled=true}, has at
     * least the configured minimum number of replicas, and is not labelled
     * {@code service.tier=critical}. Deployments without labels or without a replica
     * count are treated as not eligible instead of failing with a NullPointerException.
     */
    private boolean isEligibleForChaos(Deployment deployment, ChaosConfiguration config) {
        Map<String, String> labels = deployment.getMetadata().getLabels();
        if (labels == null) {
            // No labels means no explicit opt-in.
            return false;
        }

        // Only services that opted in to chaos engineering
        if (!"true".equals(labels.get("chaos.enabled"))) {
            return false;
        }

        // Minimum replica count check; an unset count is handled conservatively.
        Integer replicas = deployment.getSpec().getReplicas();
        if (replicas == null || replicas < config.getMinimumReplicasForChaos()) {
            return false;
        }

        // Exclude production-critical services
        if ("critical".equals(labels.get("service.tier"))) {
            return false;
        }

        return true;
    }

    /**
     * Deletes one randomly chosen pod carrying the label {@code app=<serviceName>} in
     * the given namespace, then records the event and increments the termination counter.
     *
     * @param namespace   namespace to look up and delete the pod in
     * @param serviceName deployment name; assumed to match the pods' {@code app} label
     */
    private void terminateRandomInstance(String namespace, String serviceName) {
        List<Pod> pods = kubernetesClient.pods()
                .inNamespace(namespace)
                .withLabel("app", serviceName)
                .list()
                .getItems();

        if (pods.isEmpty()) {
            log.warn("No pods found for service: {}", serviceName);
            return;
        }

        Pod targetPod = pods.get(random.nextInt(pods.size()));
        String podName = targetPod.getMetadata().getName();

        log.info("Chaos Monkey: Terminating pod {} of service {}",
                podName, serviceName);

        // Terminate the pod
        kubernetesClient.pods()
                .inNamespace(namespace)
                .withName(podName)
                .delete();

        // Log the event
        chaosEventLogger.logTerminationEvent(serviceName, podName);

        // Update metrics
        chaosMetrics.incrementTerminationCounter(serviceName);
    }
}
2. Network Latency Injection
java
// Network Chaos Implementation
/**
 * Injects and removes artificial network latency on a pod by running {@code tc}
 * (traffic control) netem commands through the Kubernetes executor.
 */
@Component
public class NetworkLatencyChaos {

    /**
     * Adds a netem delay to {@code eth0} of the target pod.
     *
     * @param config target pod, base latency and jitter (jitter is a fraction of the latency)
     * @return injection result carrying the command's success flag and output
     */
    public ChaosInjectionResult injectNetworkLatency(NetworkLatencyConfig config) {
        String targetPod = config.getTargetPod();
        Duration latency = config.getLatency();
        double jitter = config.getJitter();

        long latencyMillis = latency.toMillis();
        // Jitter in milliseconds, truncated to a whole number.
        long jitterMillis = (long) (latencyMillis * jitter);

        // Inject latency with the tc (traffic control) command
        String tcCommand = String.format(
                "tc qdisc add dev eth0 root netem delay %dms %dms",
                latencyMillis,
                jitterMillis
        );

        ExecutionResult result = kubernetesExecutor.executeCommand(targetPod, tcCommand);

        if (result.isSuccess()) {
            log.info("Network latency injected: {}ms ±{}ms on pod {}",
                    latencyMillis, jitterMillis, targetPod);
        } else {
            // Surface the failure; previously it was only visible in the returned result.
            log.warn("Network latency injection failed on pod {}: {}",
                    targetPod, result.getOutput());
        }

        return ChaosInjectionResult.builder()
                .type(ChaosType.NETWORK_LATENCY)
                .target(targetPod)
                .configuration(config)
                .success(result.isSuccess())
                .message(result.getOutput())
                .injectedAt(Instant.now())
                .build();
    }

    /**
     * Deletes the root qdisc on {@code eth0} of the target pod.
     * A failed removal is logged as a warning because the injected latency then
     * remains active on the pod.
     *
     * @param targetPod pod to clean up
     */
    public void removeNetworkLatency(String targetPod) {
        String cleanupCommand = "tc qdisc del dev eth0 root";
        ExecutionResult result = kubernetesExecutor.executeCommand(targetPod, cleanupCommand);

        if (result.isSuccess()) {
            log.info("Network latency removed from pod {}", targetPod);
        } else {
            log.warn("Failed to remove network latency from pod {}: {}",
                    targetPod, result.getOutput());
        }
    }
}
3. Resource Exhaustion
java
// CPU ve Memory Stress Testing
/**
 * CPU and memory stress injection: starts {@code stress-ng} asynchronously inside the
 * target pod and returns immediately.
 */
@Component
public class ResourceExhaustionChaos {

    /**
     * Starts a CPU stress run in the target pod.
     *
     * <p>The worker count is passed as {@code --cpu 0}, which tells stress-ng to start
     * one worker per online processor where the command runs. The previous code used
     * {@code Runtime.getRuntime().availableProcessors()}, which is the processor count
     * of the JVM building the command, not of the target pod.
     *
     * @param config target pod, CPU load percentage and duration
     * @return result marked successful once the command has been submitted; the
     *         outcome of the command itself is available through the attached future
     */
    public ChaosInjectionResult injectCPUStress(CPUStressConfig config) {
        String targetPod = config.getTargetPod();
        int cpuPercentage = config.getCpuPercentage();
        Duration duration = config.getDuration();

        // Create CPU stress using stress-ng
        String stressCommand = String.format(
                "stress-ng --cpu 0 --cpu-load %d --timeout %ds",
                cpuPercentage,
                duration.getSeconds()
        );

        // Run in the background
        CompletableFuture<ExecutionResult> futureResult =
                kubernetesExecutor.executeCommandAsync(targetPod, stressCommand);

        return ChaosInjectionResult.builder()
                .type(ChaosType.CPU_STRESS)
                .target(targetPod)
                .configuration(config)
                .success(true)
                .message(String.format("CPU stress started: %d%% for %d seconds", cpuPercentage, duration.getSeconds()))
                .injectedAt(Instant.now())
                .futureResult(futureResult)
                .build();
    }

    /**
     * Starts a memory stress run (one vm worker) in the target pod.
     *
     * @param config target pod, memory size (for example "512M" or "1G") and duration
     * @return result marked successful once the command has been submitted; the
     *         outcome of the command itself is available through the attached future
     */
    public ChaosInjectionResult injectMemoryStress(MemoryStressConfig config) {
        String targetPod = config.getTargetPod();
        String memorySize = config.getMemorySize(); // "512M", "1G" etc.
        Duration duration = config.getDuration();

        // NOTE(review): memorySize is placed into the command line verbatim - confirm
        // it is validated upstream if it can come from untrusted input.
        String stressCommand = String.format(
                "stress-ng --vm 1 --vm-bytes %s --timeout %ds",
                memorySize,
                duration.getSeconds()
        );

        CompletableFuture<ExecutionResult> futureResult =
                kubernetesExecutor.executeCommandAsync(targetPod, stressCommand);

        return ChaosInjectionResult.builder()
                .type(ChaosType.MEMORY_STRESS)
                .target(targetPod)
                .configuration(config)
                .success(true)
                .message(String.format("Memory stress started: %s for %d seconds", memorySize, duration.getSeconds()))
                .injectedAt(Instant.now())
                .futureResult(futureResult)
                .build();
    }
}
Gremlin Integration
1. Gremlin API Integration
java
// Gremlin Service Integration
/**
 * Thin client for the Gremlin REST API: builds attack payloads and posts them to
 * {@code /attacks}.
 */
@Service
public class GremlinChaosService {

    @Value("${gremlin.api.key}")
    private String apiKey;

    // Not read anywhere in this class.
    @Value("${gremlin.team.id}")
    private String teamId;

    private final WebClient gremlinClient;

    /**
     * Creates the HTTP client.
     *
     * <p>The Authorization header is deliberately NOT configured here: {@code apiKey}
     * is a field-injected {@code @Value}, which is populated only after the constructor
     * has returned, so reading it at this point would produce the header "Key null".
     * The header is added per request instead.
     */
    public GremlinChaosService() {
        this.gremlinClient = WebClient.builder()
                .baseUrl("https://api.gremlin.com/v1")
                .build();
    }

    /**
     * Creates a CPU attack.
     *
     * @param request target count, CPU percentage and core count
     * @return the attack as returned by the API (blocks until the response arrives)
     */
    public GremlinAttack createCPUAttack(GremlinCPUAttackRequest request) {
        return submitAttack(buildCPUAttackPayload(request));
    }

    /** Builds the request body for a CPU attack on a random set of targets. */
    private Map<String, Object> buildCPUAttackPayload(GremlinCPUAttackRequest request) {
        Map<String, Object> payload = new HashMap<>();
        payload.put("type", "cpu");
        payload.put("target", Map.of(
                "type", "Random",
                "exact", request.getTargetCount()
        ));
        payload.put("command", Map.of(
                "type", "cpu",
                "args", List.of(
                        "-l", String.valueOf(request.getCpuPercentage()),
                        "-c", String.valueOf(request.getCpuCores())
                )
        ));
        return payload;
    }

    /**
     * Creates a network latency attack.
     *
     * @param request target container labels, latency and jitter in milliseconds
     * @return the attack as returned by the API (blocks until the response arrives)
     */
    public GremlinAttack createNetworkAttack(GremlinNetworkAttackRequest request) {
        return submitAttack(buildNetworkAttackPayload(request));
    }

    /** Builds the request body for a latency attack on containers selected by label. */
    private Map<String, Object> buildNetworkAttackPayload(GremlinNetworkAttackRequest request) {
        Map<String, Object> payload = new HashMap<>();
        payload.put("type", "network");
        payload.put("target", Map.of(
                "type", "Container",
                "container_selection", Map.of(
                        "labels", request.getTargetLabels()
                )
        ));

        Map<String, Object> command = new HashMap<>();
        command.put("type", "latency");
        command.put("args", List.of(
                "-m", String.valueOf(request.getLatencyMs()),
                "-j", String.valueOf(request.getJitterMs())
        ));
        payload.put("command", command);

        return payload;
    }

    /**
     * Posts the payload to {@code /attacks}, authenticating with the injected API key,
     * and blocks for the response.
     */
    private GremlinAttack submitAttack(Map<String, Object> payload) {
        return gremlinClient.post()
                .uri("/attacks")
                // Read apiKey at call time, when injection has completed.
                .header("Authorization", "Key " + apiKey)
                .bodyValue(payload)
                .retrieve()
                .bodyToMono(GremlinAttack.class)
                .block();
    }
}
2. Experiment Scheduling
java
// Scheduled Chaos Experiments
/**
 * Cron-driven chaos experiments: a weekly resilience plan and a weekly database
 * failover test, each guarded by a safety check.
 */
@Component
public class ScheduledChaosExperiments {

    @Autowired
    private GremlinChaosService gremlinService;

    @Autowired
    private ChaosExperimentService experimentService;

    /**
     * Microservice resilience test, every Tuesday at 10:00.
     * Skipped (with an info log) when {@link #isChaosExperimentSafe()} is false.
     */
    @Scheduled(cron = "0 0 10 * * TUE")
    public void weeklyMicroserviceResilienceTest() {
        if (isChaosExperimentSafe()) {
            ExperimentPlan weeklyPlan = ExperimentPlan.builder()
                    .name("Weekly Microservice Resilience Test")
                    .description("Test microservice resilience against random failures")
                    .experiments(List.of(
                            createServiceTerminationExperiment(),
                            createNetworkLatencyExperiment(),
                            createDatabaseConnectionExperiment()))
                    .build();
            executeExperimentPlan(weeklyPlan);
        } else {
            log.info("Skipping chaos experiment - unsafe conditions detected");
        }
    }

    /**
     * Database failover test, every Thursday at 14:00.
     * Skipped (with an info log) when the failover safety check is false.
     */
    @Scheduled(cron = "0 0 14 * * THU")
    public void databaseFailoverTest() {
        if (isDatabaseFailoverTestSafe()) {
            DatabaseFailoverExperiment failover = DatabaseFailoverExperiment.builder()
                    .targetDatabase("user-service-db")
                    .failoverType(FailoverType.PRIMARY_SHUTDOWN)
                    .duration(Duration.ofMinutes(5))
                    .expectedBehavior("Application should failover to secondary database")
                    .build();
            experimentService.executeDatabaseFailoverExperiment(failover);
        } else {
            log.info("Skipping database failover test - unsafe conditions");
        }
    }

    /**
     * Returns true only when CPU utilization is at most 70, there is no active
     * incident and no deployment is in progress. Checks run in that order and stop
     * at the first one that fails.
     */
    private boolean isChaosExperimentSafe() {
        // Production load check
        if (systemMetricsService.getCurrentCPUUtilization() > 70.0) {
            return false;
        }

        // Ongoing incidents check
        if (incidentService.getActiveIncidentCount() > 0) {
            return false;
        }

        // Deployment window check
        return !deploymentService.isDeploymentInProgress();
    }
}
Chaos Engineering Best Practices
1. Safe Failure Injection
java
// Safety Controls
/**
 * Runs a fixed sequence of safety checks before an experiment is allowed to start.
 */
@Component
public class ChaosSafetyController {

    /**
     * Executes the checks in order and stops at the first one that reports unsafe.
     *
     * <p>NOTE(review): the checks are created with {@code new} although
     * {@code LoadLevelCheck} is annotated {@code @Component} - confirm how their
     * collaborators are supposed to be supplied.
     *
     * @param experiment experiment about to be executed
     * @return {@code true} when every check reports safe, {@code false} otherwise
     */
    public boolean validateExperimentSafety(ChaosExperiment experiment) {
        List<SafetyCheck> safetyChecks = List.of(
                new LoadLevelCheck(),
                new ActiveIncidentCheck(),
                new DeploymentWindowCheck(),
                new BusinessHoursCheck(),
                new ResourceAvailabilityCheck());

        for (SafetyCheck safetyCheck : safetyChecks) {
            SafetyCheckResult outcome = safetyCheck.execute(experiment);
            if (outcome.isSafe()) {
                continue;
            }
            log.warn("Safety check failed: {} - {}",
                    safetyCheck.getName(), outcome.getReason());
            return false;
        }
        return true;
    }

    /**
     * Reports unsafe when current CPU or memory utilization is above 80.
     */
    @Component
    public static class LoadLevelCheck implements SafetyCheck {

        @Override
        public SafetyCheckResult execute(ChaosExperiment experiment) {
            double cpuNow = systemMetrics.getCurrentCPUUtilization();
            double memoryNow = systemMetrics.getCurrentMemoryUtilization();

            boolean overloaded = cpuNow > 80.0 || memoryNow > 80.0;
            if (!overloaded) {
                return SafetyCheckResult.safe();
            }

            String reason = String.format(
                    "High resource utilization: CPU=%.1f%%, Memory=%.1f%%",
                    cpuNow, memoryNow);
            return SafetyCheckResult.unsafe(reason);
        }
    }
}
2. Blast Radius Control
java
// Blast Radius Management
/**
 * Estimates how far the effects of an experiment can spread and decides whether
 * that blast radius is acceptable.
 */
@Component
public class BlastRadiusController {

    /**
     * Builds the assessment step by step: affected services, user impact, business
     * impact, and finally the risk level computed from the assessment filled so far.
     *
     * @param experiment experiment whose target service is analysed
     * @return the populated assessment
     */
    public BlastRadiusAssessment assessBlastRadius(ChaosExperiment experiment) {
        BlastRadiusAssessment report = new BlastRadiusAssessment();

        // Service dependency analysis
        Set<String> impacted = analyzeServiceDependencies(experiment.getTargetService());
        report.setAffectedServices(impacted);

        // User impact analysis
        report.setUserImpact(analyzeUserImpact(impacted));

        // Business process impact
        report.setBusinessImpact(analyzeBusinessProcessImpact(impacted));

        // Risk level calculation
        report.setRiskLevel(calculateRiskLevel(report));

        return report;
    }

    /**
     * Returns the union of the target's downstream services and its critical-path
     * services, both read from the service mesh dependency graph.
     */
    private Set<String> analyzeServiceDependencies(String targetService) {
        // Fetch the dependency graph from the service mesh
        ServiceDependencyGraph dependencyGraph = serviceMeshService.getDependencyGraph();

        // Downstream dependencies
        Set<String> downstream = dependencyGraph.getDownstreamServices(targetService);

        // Critical path analysis
        Set<String> criticalPath = dependencyGraph.getCriticalPathServices(targetService);

        Set<String> union = new HashSet<>(downstream);
        union.addAll(criticalPath);
        return union;
    }

    /**
     * Acceptable means: risk level is not HIGH, affected user percentage is not
     * above 10, and revenue impact percentage is not above 5. The conditions are
     * evaluated in that order and evaluation stops at the first violation.
     */
    public boolean isBlastRadiusAcceptable(BlastRadiusAssessment assessment) {
        return assessment.getRiskLevel() != RiskLevel.HIGH
                && !(assessment.getUserImpact().getAffectedUserPercentage() > 10.0)
                && !(assessment.getBusinessImpact().getRevenueImpactPercentage() > 5.0);
    }
}
3. Monitoring ve Observability
java
// Chaos Experiment Monitoring
/**
 * Reacts to experiment lifecycle events: sets up monitoring and alert suppression
 * when an experiment starts, and tears both down and produces a report when it
 * completes.
 */
@Component
public class ChaosExperimentMonitor {

    /** Metric names tracked for every experiment. */
    private static final List<String> TRACKED_METRICS = List.of(
            "response_time_p95",
            "error_rate",
            "throughput",
            "cpu_utilization",
            "memory_utilization");

    /**
     * On start: begin monitoring, suppress alerts for the target service for the
     * experiment's duration, then notify stakeholders.
     */
    @EventListener
    public void onExperimentStarted(ChaosExperimentStartedEvent event) {
        ChaosExperiment started = event.getExperiment();

        // Monitoring setup
        setupExperimentMonitoring(started);

        // Alert suppression (for the duration of the experiment)
        alertSuppressionService.suppressAlerts(started.getTargetService(), started.getDuration());

        // Stakeholder notification
        notificationService.notifyExperimentStart(started);
    }

    /**
     * Starts monitoring of the tracked metrics with a 10 second sampling interval
     * for the experiment's duration.
     */
    private void setupExperimentMonitoring(ChaosExperiment experiment) {
        MonitoringConfiguration monitoringConfig = MonitoringConfiguration.builder()
                .targetService(experiment.getTargetService())
                .metricsToTrack(TRACKED_METRICS)
                .samplingInterval(Duration.ofSeconds(10))
                .duration(experiment.getDuration())
                .build();

        experimentMonitoringService.startMonitoring(experiment.getId(), monitoringConfig);
    }

    /**
     * On completion: stop monitoring, lift alert suppression, analyse the collected
     * metrics and generate the report.
     */
    @EventListener
    public void onExperimentCompleted(ChaosExperimentCompletedEvent event) {
        ChaosExperiment finished = event.getExperiment();

        // Monitoring cleanup
        experimentMonitoringService.stopMonitoring(finished.getId());

        // Alert suppression cleanup
        alertSuppressionService.removeSuppressions(finished.getTargetService());

        // Results analysis followed by report generation
        generateExperimentReport(finished, analyzeExperimentResults(finished));
    }

    /**
     * Fetches the metrics recorded for the experiment and fills an analysis with
     * performance impact, system behavior and recovery behavior, in that order.
     */
    private ExperimentAnalysis analyzeExperimentResults(ChaosExperiment experiment) {
        List<ServiceMetrics> collected = experimentMonitoringService.getMetrics(experiment.getId());

        ExperimentAnalysis summary = new ExperimentAnalysis();

        // Performance impact analysis
        summary.setPerformanceImpact(analyzePerformanceImpact(collected));

        // System behavior analysis
        summary.setSystemBehavior(analyzeSystemBehavior(collected));

        // Recovery analysis
        summary.setRecoveryAnalysis(analyzeRecoveryBehavior(collected, experiment));

        return summary;
    }
}
Chaos Engineering Metrics
1. Experiment Success Metrics
java
// Chaos Metrics Collection
/**
 * Publishes chaos-experiment metrics (counter, timer and gauges) to a Micrometer
 * {@link MeterRegistry}.
 *
 * <p>Gauge values are kept in holders owned by this collector. A gauge is registered
 * once per name/tag combination and later recordings only update its holder. The
 * previous code registered gauges directly against short-lived result objects, which
 * a gauge does not keep alive and which are ignored on repeated registration.
 */
@Component
public class ChaosMetricsCollector {

    private final MeterRegistry meterRegistry;

    /** Latest value per gauge series, keyed by metric name plus tags. */
    private final java.util.concurrent.ConcurrentMap<String, java.util.concurrent.atomic.AtomicReference<Double>> gaugeValues =
            new java.util.concurrent.ConcurrentHashMap<>();

    public ChaosMetricsCollector(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
    }

    /**
     * Records one experiment execution: increments the execution counter, records
     * the experiment duration, and updates the MTTR (only when incidents were
     * detected) and resilience-score gauges for the target service.
     *
     * @param experiment executed experiment (target service, method type, duration)
     * @param result     outcome of the execution
     */
    public void recordExperimentExecution(ChaosExperiment experiment, ExperimentResult result) {
        // Experiment count
        Counter.builder("chaos_experiments_total")
                .description("Total number of chaos experiments executed")
                .tag("service", experiment.getTargetService())
                .tag("type", experiment.getMethod().getType())
                .tag("status", result.isSuccess() ? "success" : "failure")
                .register(meterRegistry)
                .increment();

        // Experiment duration
        Timer.builder("chaos_experiment_duration_seconds")
                .description("Duration of chaos experiments")
                .tag("service", experiment.getTargetService())
                .tag("type", experiment.getMethod().getType())
                .register(meterRegistry)
                .record(experiment.getDuration());

        // MTTR during experiment
        if (result.getIncidentsDetected() > 0) {
            publishGauge(
                    "chaos_experiment_mttr_seconds",
                    "Mean time to recovery during chaos experiment",
                    result.getMeanTimeToRecovery().getSeconds(),
                    "service", experiment.getTargetService());
        }

        // System resilience score
        publishGauge(
                "chaos_system_resilience_score",
                "System resilience score based on chaos experiments",
                result.getResilienceScore(),
                "service", experiment.getTargetService());
    }

    /**
     * Updates the blast-radius gauges (number of affected services and affected
     * user percentage) from the given assessment.
     *
     * @param assessment blast radius assessment to publish
     */
    public void recordBlastRadiusMetrics(BlastRadiusAssessment assessment) {
        publishGauge(
                "chaos_blast_radius_affected_services",
                "Number of services affected by chaos experiment",
                assessment.getAffectedServices().size());

        publishGauge(
                "chaos_blast_radius_user_impact_percentage",
                "Percentage of users affected by chaos experiment",
                assessment.getUserImpact().getAffectedUserPercentage());
    }

    /**
     * Sets the current value of a gauge series, registering the gauge on first use.
     *
     * @param name        metric name
     * @param description metric description used at registration
     * @param value       value to publish
     * @param tags        alternating tag keys and values (may be empty)
     */
    private void publishGauge(String name, String description, double value, String... tags) {
        String seriesKey = name + "|" + String.join("|", tags);
        gaugeValues.computeIfAbsent(seriesKey, key -> {
            java.util.concurrent.atomic.AtomicReference<Double> holder =
                    new java.util.concurrent.atomic.AtomicReference<>(value);
            // The holder is strongly referenced by gaugeValues, so the gauge keeps reporting.
            Gauge.builder(name, holder, h -> h.get())
                    .description(description)
                    .tags(tags)
                    .register(meterRegistry);
            return holder;
        }).set(value);
    }
}
2. Resilience Scoring
java
// System Resilience Scorer
/**
 * Turns the chaos experiments run against a service into a resilience score.
 */
@Component
public class SystemResilienceScorer {

    /**
     * Scores a service over the given period.
     *
     * @param serviceName service to score
     * @param period      period passed to the experiment lookup
     * @return {@code ResilienceScore.noData()} when no experiments were found;
     *         otherwise a score whose overall value is the plain average of the
     *         recovery, performance, availability and error-handling scores
     */
    public ResilienceScore calculateResilienceScore(String serviceName, Duration period) {
        List<ChaosExperiment> experiments = getExperimentsForPeriod(serviceName, period);
        if (experiments.isEmpty()) {
            return ResilienceScore.noData();
        }

        double recovery = calculateRecoveryScore(experiments);
        double performance = calculatePerformanceScore(experiments);
        double availability = calculateAvailabilityScore(experiments);
        double errorHandling = calculateErrorHandlingScore(experiments);

        double overall = (recovery + performance + availability + errorHandling) / 4.0;

        return ResilienceScore.builder()
                .serviceName(serviceName)
                .period(period)
                .overallScore(overall)
                .recoveryScore(recovery)
                .performanceScore(performance)
                .availabilityScore(availability)
                .errorHandlingScore(errorHandling)
                .experimentCount(experiments.size())
                .calculatedAt(Instant.now())
                .build();
    }

    /**
     * Maps the average recovery time (whole seconds) onto a 0-100 score:
     * 100 at 30 seconds or less, 0 at 300 seconds or more, linear in between.
     * Returns 0 when no result carries a recovery time.
     */
    private double calculateRecoveryScore(List<ChaosExperiment> experiments) {
        // Collect every non-null recovery time from experiments that have results.
        List<Duration> recoveryTimes = experiments.stream()
                .map(ChaosExperiment::getResults)
                .filter(results -> results != null)
                .flatMap(List::stream)
                .map(ExperimentResult::getRecoveryTime)
                .filter(recoveryTime -> recoveryTime != null)
                .collect(Collectors.toList());

        if (recoveryTimes.isEmpty()) {
            return 0.0;
        }

        double seconds = calculateAverage(recoveryTimes).getSeconds();

        if (seconds <= 30) {
            return 100.0;
        }
        if (seconds >= 300) {
            return 0.0;
        }
        // Linear interpolation across the 270 second band between the two limits.
        return 100.0 - ((seconds - 30) / 270.0) * 100.0;
    }
}
Continuous Chaos Engineering
1. Chaos as Code
yaml
# chaos-experiment-config.yaml
# Declarative definition of a pod-failure experiment against user-service.
# NOTE(review): nesting was reconstructed from a flattened listing - confirm that
# hypothesis, experiment and safety belong under spec in the actual schema.
apiVersion: chaos.engineering/v1
kind: ChaosExperiment
metadata:
  name: user-service-resilience-test
  namespace: production
spec:
  schedule: "0 10 * * TUE" # Every Tuesday at 10:00
  hypothesis:
    description: "User service should maintain <200ms P95 latency during pod failures"
    steadyStateMetrics:
      - name: "p95_latency"
        query: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"user-service\"}[5m]))"
        threshold: 0.2
      - name: "error_rate"
        query: "rate(http_requests_total{service=\"user-service\",status=~\"5..\"}[5m])"
        threshold: 0.01
  experiment:
    type: "pod-failure"
    target:
      namespace: "production"
      labelSelector:
        app: "user-service"
    configuration:
      killPercentage: 25
      duration: "5m"
  safety:
    minReplicas: 3
    maxConcurrentExperiments: 1
    businessHoursOnly: true
    excludeWeekends: true
2. GitOps Integration
java
// Chaos Experiment GitOps Controller
/**
 * GitOps controller: reacts to repository changes and deploys, updates or
 * deactivates chaos experiments accordingly.
 */
@Component
public class ChaosExperimentController {

    /**
     * Handles a repository change. Events that touch no file under
     * {@code chaos-experiments/} are ignored; otherwise the changed files are parsed
     * and each resulting config is either deactivated (when marked deleted) or
     * deployed/updated.
     */
    @EventListener
    public void onChaosConfigurationChange(GitRepositoryChangeEvent event) {
        boolean noChaosFilesTouched = event.getChangedFiles().stream()
                .noneMatch(file -> file.startsWith("chaos-experiments/"));
        if (noChaosFilesTouched) {
            return;
        }

        List<ChaosExperimentConfig> parsedConfigs = parseConfigFiles(event.getChangedFiles());

        for (ChaosExperimentConfig parsed : parsedConfigs) {
            if (!parsed.isDeleted()) {
                deployOrUpdateExperiment(parsed);
            } else {
                deactivateExperiment(parsed.getName());
            }
        }
    }

    /**
     * Validates the config, rejects it when the safety validator reports it unsafe,
     * then creates, saves and schedules the experiment.
     *
     * @throws UnsafeExperimentConfigException when the config fails the safety check
     */
    private void deployOrUpdateExperiment(ChaosExperimentConfig config) {
        // Validation
        validateExperimentConfig(config);

        // Safety checks
        boolean safe = chaosConfigValidator.isConfigSafe(config);
        if (!safe) {
            throw new UnsafeExperimentConfigException(
                    "Experiment configuration is not safe: " + config.getName());
        }

        // Deploy experiment
        ChaosExperiment deployed = chaosExperimentFactory.createFromConfig(config);
        chaosExperimentRepository.save(deployed);

        // Schedule execution
        chaosScheduler.scheduleExperiment(deployed);

        log.info("Chaos experiment deployed: {}", config.getName());
    }
}
Chaos Engineering, sistem dayanıklılığını proaktif olarak test etmenin ve iyileştirmenin güçlü bir yoludur. Doğru implementasyon ile beklenmedik durumlar karşısında daha güvenilir sistemler inşa edebilirsiniz.