How to reproduce: Tomcat with the NIO connector, using HTTPS scheme, when a client requests for some quite a big (~100K-200K) static file, and breaks the connection unexpectedly. Here is the python script to reproduce the problem: # The way to reproduce the bug in the Tomcat NIO connector. # Install python-iptables from https://github.com/ldx/python-iptables/downloads # Author: Dmitry Kukushkin (dmitry.kukushkin at external.telekom.de) from threading import Thread from threading import Lock from socket import * from select import * from time import * from traceback import * import ssl import sys import iptc blockedPorts = dict() getCssRequest = """GET /<some big file> HTTP/1.1\r Host: <host>:<port>\r User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\r Accept: text/css,*/*;q=0.1\r Accept-Language: de-de\r Accept-Encoding: gzip, deflate\r Connection: keep-alive\r\n \r\n """ class Client(Thread): def __init__(self, tid, lock): Thread.__init__(self) self.lock = lock self.tid = tid def run(self): print "Starting thread %d" % self.tid try: clientSocket = socket(AF_INET, SOCK_STREAM) sslSocket = ssl.wrap_socket(clientSocket) remoteAddr = (<host>, <port>) sslSocket.connect(remoteAddr) localAddr, localPort = sslSocket.getsockname() print "New socket created tid=%d, sfd=%d, sport=%d" % ( self.tid, clientSocket.fileno(), localPort ) self.deleteFirewallRule(localPort, self.lock) sslSocket.send(getCssRequest) data = sslSocket.recv(1024) ''' Put the socket into half - closed state ''' clientSocket.shutdown(SHUT_WR) clientSocket.close() self.createFirewallRule(localPort, self.lock) except Exception, e: print "Error: ", e print_exc() @staticmethod def createFirewallRule(port, lock): lock.acquire() print "Creating the iptables rule for port %d" % port rule = iptc.Rule() rule.protocol = "tcp" rule.target = iptc.Target(rule, "REJECT") match = iptc.Match(rule, "tcp") match.sport = "%s" % port rule.add_match(match) chain = iptc.Chain(iptc.TABLE_FILTER, "OUTPUT") chain.insert_rule(rule) rule.target.reset() blockedPorts[port] = rule lock.release() @staticmethod def deleteFirewallRule(port, lock): lock.acquire() if port in blockedPorts: print "Deleteng the iptables rule for port %d" % port rule = blockedPorts[port] chain = iptc.Chain(iptc.TABLE_FILTER, "OUTPUT") chain.delete_rule(rule) chain.flush() del blockedPorts[port] lock.release() if __name__ == "__main__": if len(sys.argv) == 1 : print "Problem.py <number of threads>" exit(0) clients = [] lock = Lock() for i in range( int(sys.argv[1]) ): c = Client(i, lock) clients.append(c) c.start() print "Joining" for i in clients: i.join() The client shutdowns the socket, thus making connection half-closed, and disappears (is blocked by firewall in case of this script). This results in a high CPU load: Servers polls the socket descriptor for both reading and writing, and gets EPOLLIN event on this half-closed connection, then it is trying to write, failing with EAGAIN, polling sd for writing and reading events again, getting EPOLLIN, trying to write... and so on, as you can see from the attached strace snippet: ... [pid 26206] epoll_ctl(50, EPOLL_CTL_DEL, 51, {0, {u32=51, u64=34359738419}}) = 0 [pid 26206] gettimeofday({1331211762, 650946}, NULL) = 0 [pid 26206] epoll_wait(50, <unfinished ...> [pid 26199] <... epoll_wait resumed> {{EPOLLIN, {u32=40, u64=7266968735074746408}}}, 4096, 1000) = 1 [pid 26199] read(40, "\1", 128) = 1 [pid 26199] gettimeofday({1331211762, 652745}, NULL) = 0 [pid 26199] epoll_wait(43, <unfinished ...> [pid 26206] <... epoll_wait resumed> {{EPOLLIN, {u32=48, u64=6182845940760624}}}, 4096, 1000) = 1 [pid 26206] read(48, "\1", 128) = 1 [pid 26206] gettimeofday({1331211762, 675214}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 675294}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 675378}, NULL) = 0 [pid 26206] epoll_ctl(50, EPOLL_CTL_ADD, 51, {EPOLLOUT, {u32=51, u64=51}}) = 0 [pid 26206] epoll_ctl(50, EPOLL_CTL_MOD, 51, {EPOLLIN|EPOLLOUT, {u32=51, u64=7255244058472218675}}) = 0 [pid 26206] gettimeofday({1331211762, 675570}, NULL) = 0 [pid 26206] epoll_wait(50, {{EPOLLIN, {u32=51, u64=7255244058472218675}}}, 4096, 1000) = 1 [pid 26206] gettimeofday({1331211762, 675699}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 675761}, NULL) = 0 [pid 26206] write(51, "\272\270\301c\226\341\272o-<\233\370|\223\313\241\177\215*\236\201\327YI5u\210J*A\371z"..., 116) = -1 EAGAIN (Resource temporarily unavailable) [pid 26206] gettimeofday({1331211762, 675925}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 675979}, NULL) = 0 [pid 26206] epoll_wait(50, {{EPOLLIN, {u32=51, u64=7255244058472218675}}}, 4096, 1000) = 1 [pid 26206] gettimeofday({1331211762, 676082}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 676144}, NULL) = 0 [pid 26206] write(51, "\272\270\301c\226\341\272o-<\233\370|\223\313\241\177\215*\236\201\327YI5u\210J*A\371z"..., 116) = -1 EAGAIN (Resource temporarily unavailable) [pid 26206] gettimeofday({1331211762, 676273}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 676327}, NULL) = 0 [pid 26206] epoll_wait(50, {{EPOLLIN, {u32=51, u64=7255244058472218675}}}, 4096, 1000) = 1 [pid 26206] gettimeofday({1331211762, 676429}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 676476}, NULL) = 0 [pid 26206] write(51, "\272\270\301c\226\341\272o-<\233\370|\223\313\241\177\215*\236\201\327YI5u\210J*A\371z"..., 116) = -1 EAGAIN (Resource temporarily unavailable) [pid 26206] gettimeofday({1331211762, 676609}, NULL) = 0 [pid 26206] gettimeofday({1331211762, 676671}, NULL) = 0 ... Possible solution: Check if EPOLLOUT event has arrived before attempting to send the file: --- apache-tomcat-7.0.26-src/java/org/apache/tomcat/util/net/NioEndpoint.java +++ apache-tomcat-7.0.26-src/java/org/apache/tomcat/util/net/NioEndpoint.java @@ -1205,7 +1205,7 @@ sk.attach(attachment);//cant remember why this is here NioChannel channel = attachment.getChannel(); if (sk.isReadable() || sk.isWritable() ) { - if ( attachment.getSendfileData() != null ) { + if ( sk.isWritable() && attachment.getSendfileData() != null ) { processSendfile(sk,attachment,true, false); } else if ( attachment.getComet() ) { //check if thread is available
Thanks for the test case. It made investigating this issue a whole lot easier. I fixed this with a slightly different patch that addresses the root cause - i.e. not unregistering the interest in OP_READ while the sendfile write is in progress. This has been fixed in trunk and 7.0.x and will be included in 7.0.27 onwards.
I think this fix is incorrect. Due to https://issues.apache.org/bugzilla/show_bug.cgi?id=53138
Fixed in trunk in r1340215 Fixed in 7.0.x in r1340218 Will be available in 7.0.28
Fixed in 6.0 by r1372035 and will be in 6.0.36.