---
layout: pattern
title: Circuit Breaker
folder: circuit-breaker
permalink: /patterns/circuit-breaker/
categories: Behavioral
tags:
 - Performance
 - Decoupling
 - Cloud distributed
---

## Intent

Handle costly remote service calls in such a way that the failure of a single service/component cannot bring the whole application down, and we can reconnect to the service as soon as possible.

## Explanation

Real world example

> Imagine a web application that has both local files/images and remote services used for fetching data. These remote services may be healthy and responsive at times, or may become slow and unresponsive at some point due to a variety of reasons. So if one of the remote services is slow or not responding successfully, our application will try to fetch a response from it using multiple threads/processes, and soon all of them will hang (this is also called thread starvation), causing our entire web application to crash. We should be able to detect this situation and show the user an appropriate message so that he or she can explore the other parts of the application unaffected by the remote service failure. Meanwhile, the other services that are working normally should keep functioning, unaffected by this failure.

In plain words

> Circuit Breaker allows graceful handling of failed remote services. It works especially well when all parts of our application are highly decoupled from each other, and the failure of one component doesn't mean the other parts will stop working.

Wikipedia says

> Circuit breaker is a design pattern used in modern software development. It is used to detect failures and encapsulates the logic of preventing a failure from constantly recurring, during maintenance, temporary external system failure or unexpected system difficulties.

## Programmatic Example

So, how does this all come together? With the above example in mind, we will simulate the functionality in a simple example. A monitoring service (the Monitoring Service in the diagram below) mimics the web app and makes both local and remote calls.

The service architecture is as follows:


The end user (the End User in the diagram above) application code:

```java
@Slf4j
public class App {

  private static final Logger LOGGER = LoggerFactory.getLogger(App.class);

  /**
   * Program entry point.
   *
   * @param args command line args
   */
  public static void main(String[] args) {

    var serverStartTime = System.nanoTime();

    var delayedService = new DelayedRemoteService(serverStartTime, 5);
    var delayedServiceCircuitBreaker = new DefaultCircuitBreaker(delayedService, 3000, 2,
        2000 * 1000 * 1000);

    var quickService = new QuickRemoteService();
    var quickServiceCircuitBreaker = new DefaultCircuitBreaker(quickService, 3000, 2,
        2000 * 1000 * 1000);

    // Create an object of monitoring service which makes both local and remote calls
    var monitoringService = new MonitoringService(delayedServiceCircuitBreaker,
        quickServiceCircuitBreaker);

    // Fetch response from local resource
    LOGGER.info(monitoringService.localResourceResponse());

    // Fetch response from delayed service 2 times, to meet the failure threshold
    LOGGER.info(monitoringService.delayedServiceResponse());
    LOGGER.info(monitoringService.delayedServiceResponse());

    // Fetch current state of delayed service circuit breaker after crossing failure threshold limit
    // which is OPEN now
    LOGGER.info(delayedServiceCircuitBreaker.getState());

    // Meanwhile, the delayed service is down, fetch response from the healthy quick service
    LOGGER.info(monitoringService.quickServiceResponse());
    LOGGER.info(quickServiceCircuitBreaker.getState());

    // Wait for the delayed service to become responsive
    try {
      LOGGER.info("Waiting for delayed service to become responsive");
      Thread.sleep(5000);
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
    // Check the state of delayed circuit breaker, should be HALF_OPEN
    LOGGER.info(delayedServiceCircuitBreaker.getState());

    // Fetch response from delayed service, which should be healthy by now
    LOGGER.info(monitoringService.delayedServiceResponse());
    // As successful response is fetched, it should be CLOSED again.
    LOGGER.info(delayedServiceCircuitBreaker.getState());
  }
}
```
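
The `RemoteService`, `RemoteServiceException`, `DelayedRemoteService` and `QuickRemoteService` types used above are not reproduced in this excerpt. A minimal sketch that is consistent with how they are called here, but not necessarily identical to the repository's actual classes, might look like this:

```java
// Assumed shape of the remote service abstraction wrapped by the circuit breaker.
interface RemoteService {
  String call() throws RemoteServiceException;
}

// Thrown when a remote call fails; the circuit breaker caches its message.
class RemoteServiceException extends Exception {
  public RemoteServiceException(String message) {
    super(message);
  }
}

// Simulates a service that stays unresponsive for `delay` seconds after server start.
class DelayedRemoteService implements RemoteService {
  private final long serverStartTime;
  private final int delay;

  public DelayedRemoteService(long serverStartTime, int delay) {
    this.serverStartTime = serverStartTime;
    this.delay = delay;
  }

  @Override
  public String call() throws RemoteServiceException {
    // Fail until the configured warm-up delay (in seconds) has elapsed
    if (System.nanoTime() - serverStartTime < delay * 1_000_000_000L) {
      throw new RemoteServiceException("Delayed service is down");
    }
    return "Delayed service is working";
  }
}

// A service that always responds successfully.
class QuickRemoteService implements RemoteService {
  @Override
  public String call() {
    return "Quick Service is working";
  }
}
```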

The monitoring service (the Monitoring Service in the diagram above) code:

```java
public class MonitoringService {

  private final CircuitBreaker delayedService;

  private final CircuitBreaker quickService;

  public MonitoringService(CircuitBreaker delayedService, CircuitBreaker quickService) {
    this.delayedService = delayedService;
    this.quickService = quickService;
  }

  // Assumption: Local service won't fail, no need to wrap it in a circuit breaker logic
  public String localResourceResponse() {
    return "Local Service is working";
  }

  /**
   * Fetch response from the delayed service (with some simulated startup time).
   *
   * @return response string
   */
  public String delayedServiceResponse() {
    try {
      return this.delayedService.attemptRequest();
    } catch (RemoteServiceException e) {
      return e.getMessage();
    }
  }

  /**
   * Fetches response from a healthy service without any failure.
   *
   * @return response string
   */
  public String quickServiceResponse() {
    try {
      return this.quickService.attemptRequest();
    } catch (RemoteServiceException e) {
      return e.getMessage();
    }
  }
}
```

As can be seen, it makes the call to fetch the local resource directly, but it wraps the calls to the remote (costly) service in a circuit breaker object, which prevents faults as follows:

```java
public class DefaultCircuitBreaker implements CircuitBreaker {

  private final long timeout;
  private final long retryTimePeriod;
  private final RemoteService service;
  long lastFailureTime;
  private String lastFailureResponse;
  int failureCount;
  private final int failureThreshold;
  private State state;
  // Long literal so the multiplication doesn't overflow int arithmetic
  private final long futureTime = 1000L * 1000 * 1000 * 1000;

  /**
   * Constructor to create an instance of Circuit Breaker.
   *
   * @param timeout          Timeout for the API request. Not necessary for this simple example
   * @param failureThreshold Number of failures we receive from the depended service before changing
   *                         state to 'OPEN'
   * @param retryTimePeriod  Time period after which a new request is made to remote service for
   *                         status check.
   */
  DefaultCircuitBreaker(RemoteService serviceToCall, long timeout, int failureThreshold,
      long retryTimePeriod) {
    this.service = serviceToCall;
    // We start in a closed state hoping that everything is fine
    this.state = State.CLOSED;
    this.failureThreshold = failureThreshold;
    // Timeout for the API request.
    // Used to break the calls made to remote resource if it exceeds the limit
    this.timeout = timeout;
    this.retryTimePeriod = retryTimePeriod;
    // An absurd amount of time in future which basically indicates the last failure never happened
    this.lastFailureTime = System.nanoTime() + futureTime;
    this.failureCount = 0;
  }

  // Reset everything to defaults
  @Override
  public void recordSuccess() {
    this.failureCount = 0;
    this.lastFailureTime = System.nanoTime() + futureTime;
    this.state = State.CLOSED;
  }

  @Override
  public void recordFailure(String response) {
    failureCount = failureCount + 1;
    this.lastFailureTime = System.nanoTime();
    // Cache the failure response for returning on open state
    this.lastFailureResponse = response;
  }

  // Evaluate the current state based on failureThreshold, failureCount and lastFailureTime.
  protected void evaluateState() {
    if (failureCount >= failureThreshold) { // Then something is wrong with remote service
      if ((System.nanoTime() - lastFailureTime) > retryTimePeriod) {
        // We have waited long enough and should try checking if service is up
        state = State.HALF_OPEN;
      } else {
        // Service would still probably be down
        state = State.OPEN;
      }
    } else {
      // Everything is working fine
      state = State.CLOSED;
    }
  }

  @Override
  public String getState() {
    evaluateState();
    return state.name();
  }

  /**
   * Break the circuit beforehand if it is known service is down Or connect the circuit manually if
   * service comes online before expected.
   *
   * @param state State at which circuit is in
   */
  @Override
  public void setState(State state) {
    this.state = state;
    switch (state) {
      case OPEN:
        this.failureCount = failureThreshold;
        this.lastFailureTime = System.nanoTime();
        break;
      case HALF_OPEN:
        this.failureCount = failureThreshold;
        this.lastFailureTime = System.nanoTime() - retryTimePeriod;
        break;
      default:
        this.failureCount = 0;
    }
  }

  /**
   * Executes service call.
   *
   * @return Value from the remote resource, stale response or a custom exception
   */
  @Override
  public String attemptRequest() throws RemoteServiceException {
    evaluateState();
    if (state == State.OPEN) {
      // return cached response if the circuit is in OPEN state
      return this.lastFailureResponse;
    } else {
      // Make the API request if the circuit is not OPEN
      try {
        // In a real application, this would be run in a thread and the timeout
        // parameter of the circuit breaker would be utilized to know if service
        // is working. Here, we simulate that based on server response itself
        var response = service.call();
        // Yay!! the API responded fine. Let's reset everything.
        recordSuccess();
        return response;
      } catch (RemoteServiceException ex) {
        recordFailure(ex.getMessage());
        throw ex;
      }
    }
  }
}
```
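
The `CircuitBreaker` interface and the `State` enum that `DefaultCircuitBreaker` relies on are also not shown above. Judging from the overridden methods and the state names used, they plausibly look like the following sketch; the repository's exact declarations may differ:

```java
// Assumed circuit states, matching the names used by evaluateState() and setState() above.
enum State {
  CLOSED,
  OPEN,
  HALF_OPEN
}

// Contract implemented by DefaultCircuitBreaker, inferred from the @Override methods above.
interface CircuitBreaker {

  // Bookkeeping hooks that drive the state machine
  void recordSuccess();

  void recordFailure(String response);

  // Query or force the current state
  String getState();

  void setState(State state);

  // Attempt a call through the breaker
  String attemptRequest() throws RemoteServiceException;
}
```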

How does the above pattern prevent failures? Let's understand it through the finite state machine it implements.


- We initialize the circuit breaker object with certain parameters: `timeout`, `failureThreshold` and `retryTimePeriod`, which help determine how resilient the API is.
- Initially, the circuit breaker is in the `closed` state and no remote calls to the API have been made.
- Every time a call succeeds, we reset the state to as it was in the beginning.
- If the number of failures crosses a certain threshold (`failureThreshold`), the circuit breaker moves to the `open` state, where it acts just like an open circuit and prevents remote service calls from being made, thus saving resources.
- Once we exceed the retry timeout period (`retryTimePeriod`), the circuit breaker moves to the `half-open` state and makes another call to the remote service to check whether it is healthy again, so that we can serve fresh content. A failure sets it back to the `open` state and another attempt is made after the retry timeout period, while a success sets it to the `closed` state so that everything starts working normally again (the sketch below walks through these transitions).

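To make these transitions concrete, here is a small, hypothetical snippet that reuses `DefaultCircuitBreaker` together with the `DelayedRemoteService` sketched earlier and drives the breaker through CLOSED, OPEN, HALF_OPEN and back to CLOSED. The service and timing values are illustrative only:

```java
public static void stateTransitionDemo() throws InterruptedException {
  var serverStartTime = System.nanoTime();
  // Hypothetical wiring: the service stays down for 4 seconds, the breaker opens after
  // 2 failures and probes again 2 seconds (2 * 10^9 ns) after the last failure.
  var service = new DelayedRemoteService(serverStartTime, 4);
  var breaker = new DefaultCircuitBreaker(service, 3000, 2, 2_000_000_000L);

  System.out.println(breaker.getState()); // CLOSED - no failures recorded yet

  for (int i = 0; i < 2; i++) {
    try {
      breaker.attemptRequest();           // both calls fail while the service is still down
    } catch (RemoteServiceException e) {
      // each failure is recorded by the breaker
    }
  }
  System.out.println(breaker.getState()); // OPEN - failure threshold of 2 reached

  Thread.sleep(2500);                     // wait longer than retryTimePeriod
  System.out.println(breaker.getState()); // HALF_OPEN - ready to probe the service again

  Thread.sleep(2000);                     // by now the delayed service has recovered
  try {
    System.out.println(breaker.attemptRequest()); // probe succeeds and resets the breaker
  } catch (RemoteServiceException e) {
    // a failed probe would record another failure and re-open the circuit
  }
  System.out.println(breaker.getState()); // CLOSED again
}
```
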
## Class diagram

## Applicability

Use the Circuit Breaker pattern when

- Building a fault-tolerant application where the failure of some services shouldn't bring the whole application down.
- Building a continuously running (always-on) application, so that its components can be upgraded without shutting it down entirely.

## Related Patterns

- [Retry Pattern](https://github.com/iluwatar/java-design-patterns/tree/master/retry)

## Real world examples

* [Spring Circuit Breaker module](https://spring.io/guides/gs/circuit-breaker)
* [Netflix Hystrix API](https://github.com/Netflix/Hystrix)

## Credits

* [Understanding Circuit Breaker Pattern](https://itnext.io/understand-circuitbreaker-design-pattern-with-simple-practical-example-92a752615b42)
* [Martin Fowler on Circuit Breaker](https://martinfowler.com/bliki/CircuitBreaker.html)
* [Fault tolerance in a high volume, distributed system](https://medium.com/netflix-techblog/fault-tolerance-in-a-high-volume-distributed-system-91ab4faae74a)
* [Circuit Breaker pattern](https://docs.microsoft.com/en-us/azure/architecture/patterns/circuit-breaker)