Index: jk/native/common/jk_lb_worker.c =================================================================== RCS file: /home/cvspublic/jakarta-tomcat-connectors/jk/native/common/jk_lb_worker.c,v --- jk/native/common/jk_lb_worker.c 14 Jun 2005 06:34:13 -0000 1.91 +++ jk/native/common/jk_lb_worker.c 11 Aug 2005 03:18:57 -0000 @@ -142,6 +142,17 @@ return result; } +const char *get_method_value(int t) +{ + if (JK_LB_BYREQUESTS == t) + return "by request"; + else if (JK_LB_BYTRAFFIC == t) + return "by traffic"; + else if (JK_LB_BYBUSYNESS == t) + return "by busyness"; + else + return "Unknown"; +} /* Retrieve session id from the cookie or the parameter */ /* (parameter first) */ @@ -229,8 +240,6 @@ jk_u64_t curmin = 0; worker_record_t *candidate = NULL; - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_lock(); /* First try to see if we have available candidate */ for (i = 0; i < p->num_of_workers; i++) { /* Skip all workers that are not member of domain */ @@ -263,8 +272,6 @@ candidate->s->lb_value -= total_factor; candidate->r = &(candidate->s->domain[0]); } - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_unlock(); return candidate; } @@ -277,8 +284,6 @@ int total_factor = 0; worker_record_t *candidate = NULL; - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_lock(); /* First try to see if we have available candidate */ for (i = 0; i < p->num_of_workers; i++) { /* If the worker is in error state run @@ -303,8 +308,6 @@ if (candidate) candidate->s->lb_value -= total_factor; - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_unlock(); return candidate; } @@ -317,8 +320,6 @@ jk_u64_t curmin = 0; worker_record_t *candidate = NULL; - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_lock(); /* First try to see if we have available candidate */ for (i = 0; i < p->num_of_workers; i++) { /* If the worker is in error state run @@ -342,8 +343,74 @@ } } } - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_unlock(); + return candidate; +} + +static worker_record_t 
*find_best_bybusyness(lb_worker_t *p, + jk_logger_t *l) +{ + static unsigned int next_offset = 0; + unsigned int i; + unsigned int j; + unsigned int offset; + int bfn = 1; /* Numerator of best busy factor; initialized so the first comparison never reads an indeterminate value */ + int bfd = 1; /* Denominator of best busy factor; initialized for the same reason, overwritten once a candidate is chosen */ + int curn; /* Numerator of current busy factor */ + int curd; /* Denominator of current busy factor */ + + int left; /* left and right are used to compare rational numbers */ + int right; + + /* find the least busy worker */ + worker_record_t *candidate = NULL; + + offset = next_offset; + + /* First try to see if we have available candidate */ + for (j = 0; j < p->num_of_workers; j++) { + i = (j + offset) % p->num_of_workers; + + /* If the worker is in error state run + * retry on that worker. It will be marked as + * operational if the retry timeout is elapsed. + * The worker might still be unusable, but we try + * anyway. + */ + if (JK_WORKER_IN_ERROR(p->lb_workers[i].s)) { + retry_worker(&p->lb_workers[i], p->s->recover_wait_time, l); + } + /* Take into calculation only the workers that are + * not in error state, stopped or not disabled. + */ + if (JK_WORKER_USABLE(p->lb_workers[i].s)) { + curn = p->lb_workers[i].s->busy; + curd = p->lb_workers[i].s->lb_factor; + + /* If the server is restarted under load there is a bug that causes + * busy to be reset to zero before all the outstanding connections + * finish, they then finally finish. As a result, the busy value + * becomes negative, messing up the busyness load balancing. 
+ * When this bug is fixed, this section can be removed */ + if (curn < 0) { + jk_log(l, JK_LOG_WARNING, + "busy value is %d for worker %s, resetting it to zero", + curn, p->lb_workers[i].s->name); + p->lb_workers[i].s->busy = 0; + curn = 0; + } + + /* compare rational numbers: (a/b) < (c/d) iff a*d < c*b */ + left = curn * bfd; + right = bfn * curd; + + if (!candidate || (left < right)) { + candidate = &p->lb_workers[i]; + bfn = curn; + bfd = curd; + next_offset = i + 1; + } + } + } return candidate; } @@ -386,8 +453,6 @@ } if (candidate && !uses_domain && p->lbmethod == JK_LB_BYREQUESTS) { - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_lock(); for (i = 0; i < p->num_of_workers; i++) { if (JK_WORKER_USABLE(p->lb_workers[i].s)) { @@ -400,8 +465,6 @@ } } candidate->s->lb_value -= total_factor; - if (p->lblock == JK_LB_LOCK_PESSIMISTIC) - jk_shm_unlock(); } return candidate; } @@ -433,6 +496,9 @@ rc = find_best_byrequests(p, l); else if (p->lbmethod == JK_LB_BYTRAFFIC) rc = find_best_bytraffic(p, l); + else if (p->lbmethod == JK_LB_BYBUSYNESS) + rc = find_best_bybusyness(p, l); + /* By default use worker name as session route */ if (rc) rc->r = &(rc->s->name[0]); @@ -447,6 +513,7 @@ jk_logger_t *l) { worker_record_t *rc = NULL; + int had_session_id = JK_FALSE; char *sessionid = NULL; int r; @@ -483,8 +550,11 @@ JK_TRACE_EXIT(l); return NULL; } + if (p->lblock == JK_LB_LOCK_PESSIMISTIC) + jk_shm_lock(); if (sessionid) { char *session = sessionid; + had_session_id = JK_TRUE; if (JK_IS_DEBUG_LEVEL(l)) { jk_log(l, JK_LOG_DEBUG, "total sessionid is %s", @@ -511,13 +581,8 @@ /* We have a session route. Whow! 
*/ rc = find_bysession_route(p, session_route, l); if (rc) { - JK_LEAVE_CS(&(p->cs), r); - if (JK_IS_DEBUG_LEVEL(l)) - jk_log(l, JK_LOG_DEBUG, - "found worker %s for route %s and partial sessionid %s", - rc->s->name, session_route, sessionid); - JK_TRACE_EXIT(l); - return rc; + rc->s->sticky_session_count++; + break; } } /* Try next partial sessionid if present */ @@ -525,20 +590,31 @@ rc = NULL; } if (!rc && p->s->sticky_session_force) { - JK_LEAVE_CS(&(p->cs), r); jk_log(l, JK_LOG_INFO, "all workers are in error state for session %s", session); - JK_TRACE_EXIT(l); - return NULL; } } - rc = find_best_worker(p, l); + if (!rc && (!had_session_id || !p->s->sticky_session_force)) { + rc = find_best_worker(p, l); + if (rc) rc->s->elected++; + } + if (rc) { + /* Increment the number of workers serving request */ + p->s->busy++; + if (p->s->busy > p->s->max_busy) + p->s->max_busy = p->s->busy; + rc->s->busy++; + if (rc->s->busy > rc->s->max_busy) + rc->s->max_busy = rc->s->busy; + } + if (p->lblock == JK_LB_LOCK_PESSIMISTIC) + jk_shm_unlock(); JK_LEAVE_CS(&(p->cs), r); if (rc && JK_IS_DEBUG_LEVEL(l)) { jk_log(l, JK_LOG_DEBUG, "found best worker (%s) using %s method", rc->s->name, - p->lbmethod == JK_LB_BYREQUESTS ? "by request" : "by traffic"); + get_method_value(p->lbmethod)); } JK_TRACE_EXIT(l); return rc; @@ -572,12 +648,14 @@ worker_record_t *rec = get_most_suitable_worker(p->worker, s, attempt++, l); int rc; + int r; /* Do not reuse previous worker, because * that worker already failed. */ if (rec && rec != prec) { int is_service_error = JK_HTTP_OK; int service_stat = JK_FALSE; + int no_endpoint = JK_TRUE; jk_endpoint_t *end = NULL; s->jvm_route = rec->r; @@ -587,40 +665,37 @@ jk_log(l, JK_LOG_DEBUG, "service worker=%s jvm_route=%s", rec->s->name, s->jvm_route); - rec->s->elected++; if (rc && end) { + no_endpoint = JK_FALSE; + /* Reset endpoint read and write sizes for * this request. 
*/ end->rd = end->wr = 0; - /* Increment the number of workers serving request */ - p->worker->s->busy++; - if (p->worker->s->busy > p->worker->s->max_busy) - p->worker->s->max_busy = p->worker->s->busy; - rec->s->busy++; - if (rec->s->busy > rec->s->max_busy) - rec->s->max_busy = rec->s->busy; + service_stat = end->service(end, s, l, &is_service_error); /* Update partial reads and writes if any */ rec->s->readed += end->rd; rec->s->transferred += end->wr; end->done(&end, l); + /* When returning the endpoint mark the worker as not busy. * We have at least one endpoint free */ rec->s->is_busy = JK_FALSE; - /* Decrement the busy worker count */ - rec->s->busy--; - p->worker->s->busy--; - if (service_stat == JK_TRUE) { - rec->s->in_error_state = JK_FALSE; - rec->s->in_recovering = JK_FALSE; - rec->s->error_time = 0; - JK_TRACE_EXIT(l); - return JK_TRUE; - } } - else { + + JK_ENTER_CS(&(p->worker->cs), r); + if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) + jk_shm_lock(); + /* Decrement the busy worker count */ + rec->s->busy--; + p->worker->s->busy--; + if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) + jk_shm_unlock(); + JK_LEAVE_CS(&(p->worker->cs), r); + + if (no_endpoint) { /* If we can not get the endpoint * mark the worker as busy rather then * as in error @@ -634,7 +709,15 @@ prec = rec; continue; } - if (service_stat == JK_FALSE) { + + if (service_stat == JK_TRUE) { + rec->s->in_error_state = JK_FALSE; + rec->s->in_recovering = JK_FALSE; + rec->s->error_time = 0; + JK_TRACE_EXIT(l); + return JK_TRUE; + } + else if (service_stat == JK_FALSE) { /* * Service failed !!! * @@ -699,6 +782,17 @@ "recoverable error... 
will try to recover on other host"); } else { + if (rec) { + JK_ENTER_CS(&(p->worker->cs), r); + if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) + jk_shm_lock(); + /* Decrement the busy worker count */ + rec->s->busy--; + p->worker->s->busy--; + if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) + jk_shm_unlock(); + JK_LEAVE_CS(&(p->worker->cs), r); + } /* NULL record, no more workers left ... */ jk_log(l, JK_LOG_ERROR, "All tomcat instances failed, no more workers left"); Index: jk/native/common/jk_lb_worker.h =================================================================== RCS file: /home/cvspublic/jakarta-tomcat-connectors/jk/native/common/jk_lb_worker.h,v --- jk/native/common/jk_lb_worker.h 15 May 2005 15:22:05 -0000 1.15 +++ jk/native/common/jk_lb_worker.h 11 Aug 2005 03:18:57 -0000 @@ -39,8 +39,10 @@ #define JK_LB_BYREQUESTS (0) #define JK_LB_BYTRAFFIC (1) +#define JK_LB_BYBUSYNESS (2) #define JK_LB_METHOD_REQUESTS ("Request") #define JK_LB_METHOD_TRAFFIC ("Traffic") +#define JK_LB_METHOD_BUSYNESS ("Busyness") #define JK_LB_LOCK_DEFAULT (0) #define JK_LB_LOCK_PESSIMISTIC (1) #define JK_LB_LM_DEFAULT ("Optimistic") Index: jk/native/common/jk_shm.h =================================================================== RCS file: /home/cvspublic/jakarta-tomcat-connectors/jk/native/common/jk_shm.h,v --- jk/native/common/jk_shm.h 14 Jun 2005 06:34:13 -0000 1.22 +++ jk/native/common/jk_shm.h 11 Aug 2005 03:18:57 -0000 @@ -86,6 +86,8 @@ volatile jk_u64_t transferred; /* Number of times the worker was elected */ volatile size_t elected; + /* Number of times the worker was reused from sticky session */ + volatile size_t sticky_session_count; /* Number of non 200 responses */ volatile size_t errors; }; Index: jk/native/common/jk_status.c =================================================================== RCS file: /home/cvspublic/jakarta-tomcat-connectors/jk/native/common/jk_status.c,v --- jk/native/common/jk_status.c 14 Jun 2005 14:31:24 -0000 1.44 +++ 
jk/native/common/jk_status.c 11 Aug 2005 03:18:58 -0000 @@ -83,6 +83,14 @@ NULL }; +static const char *lb_method_type[] = { + JK_LB_METHOD_REQUESTS, + JK_LB_METHOD_TRAFFIC, + JK_LB_METHOD_BUSYNESS, + "unknown", + NULL +}; + static const char *headers_names[] = { "Content-Type", "Cache-Control", @@ -200,6 +208,13 @@ } while (1); } +static const char *status_lb_method_type(int t) +{ + if (t < 0 || t > 2) + t = 3; + return lb_method_type[t]; +} + static const char *status_worker_type(int t) { if (t < 0 || t > 6) @@ -443,12 +458,12 @@ jk_putv(s, "
Name | Type | Host | Addr | " - "Stat | F | V | Acc | Err | " + "Stat | F | V | Acc | Ssc | Err | " "Wr | Rd | Busy | Max | RR | Cd | %d | ", wr->s->lb_factor); jk_printf(s, "%d | ", wr->s->lb_value); jk_printf(s, "%u | ", wr->s->elected); + jk_printf(s, "%u | ", wr->s->sticky_session_count); jk_printf(s, "%u | ", wr->s->errors); jk_putv(s, "", status_strfsize(wr->s->transferred, buf), " | ", NULL); @@ -580,7 +596,8 @@ "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Stat | Worker status | |||||||||||||||||||
F | Load Balancer Factor | |||||||||||||||||||
V | Load Balancer Value | |||||||||||||||||||
Acc | Number of requests | |||||||||||||||||||
Acc | Number of new requests | |||||||||||||||||||
Ssc | Number of sticky session requests | |||||||||||||||||||
Err | Number of failed requests | |||||||||||||||||||
Wr | Number of bytes transferred | |||||||||||||||||||
Rd | Number of bytes read |