diff --git a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java index e3f12ff..a268d4b 100644 --- a/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java +++ b/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java @@ -27,6 +27,8 @@ import javax.servlet.http.HttpSession; import javax.servlet.http.HttpSessionBindingEvent; import javax.servlet.http.HttpSessionBindingListener; +import org.apache.catalina.Context; +import org.apache.catalina.Host; import org.apache.catalina.LifecycleException; import org.apache.catalina.connector.Request; import org.apache.catalina.connector.Response; @@ -44,8 +46,8 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession private static final Log log = LogFactory.getLog(CrawlerSessionManagerValve.class); - private final Map clientIpSessionId = new ConcurrentHashMap<>(); - private final Map sessionIdClientIp = new ConcurrentHashMap<>(); + private final Map clientIdSessionId = new ConcurrentHashMap<>(); + private final Map sessionIdClientId = new ConcurrentHashMap<>(); private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*"; private Pattern uaPattern = null; @@ -55,6 +57,10 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession private int sessionInactiveInterval = 60; + private boolean isHostAware = true; + + private boolean isContextAware = true; + /** * Specifies a default constructor so async support can be configured. 
@@ -134,7 +140,27 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession public Map getClientIpSessionId() { - return clientIpSessionId; + return clientIdSessionId; + } + + + public boolean isHostAware() { + return isHostAware; + } + + + public void setHostAware(boolean isHostAware) { + this.isHostAware = isHostAware; + } + + + public boolean isContextAware() { + return isContextAware; + } + + + public void setContextAware(boolean isContextAware) { + this.isContextAware = isContextAware; } @@ -152,9 +178,10 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession boolean isBot = false; String sessionId = null; String clientIp = request.getRemoteAddr(); + String clientIdentifier = getClientIdentifier(request.getHost(), request.getContext(), clientIp); if (log.isDebugEnabled()) { - log.debug(request.hashCode() + ": ClientIp=" + clientIp + ", RequestedSessionId=" + log.debug(request.hashCode() + ": ClientIdentifier=" + clientIdentifier + ", RequestedSessionId=" + request.getRequestedSessionId()); } @@ -194,7 +221,7 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession // If this is a bot, is the session ID known? 
if (isBot) { - sessionId = clientIpSessionId.get(clientIp); + sessionId = clientIdSessionId.get(clientIdentifier); if (sessionId != null) { request.setRequestedSessionId(sessionId); if (log.isDebugEnabled()) { @@ -211,8 +238,8 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession // Has bot just created a session, if so make a note of it HttpSession s = request.getSession(false); if (s != null) { - clientIpSessionId.put(clientIp, s.getId()); - sessionIdClientIp.put(s.getId(), clientIp); + clientIdSessionId.put(clientIdentifier, s.getId()); + sessionIdClientId.put(s.getId(), clientIdentifier); // #valueUnbound() will be called on session expiration s.setAttribute(this.getClass().getName(), this); s.setMaxInactiveInterval(sessionInactiveInterval); @@ -231,11 +258,23 @@ public class CrawlerSessionManagerValve extends ValveBase implements HttpSession } + private String getClientIdentifier(Host host, Context context, String clientIp) { + StringBuilder result = new StringBuilder(clientIp); + if (isHostAware) { + result.append('-').append(host.getName()); + } + if (isContextAware) { + result.append(context.getName()); + } + return result.toString(); + } + + @Override public void valueUnbound(HttpSessionBindingEvent event) { - String clientIp = sessionIdClientIp.remove(event.getSession().getId()); - if (clientIp != null) { - clientIpSessionId.remove(clientIp); + String clientIdentifier = sessionIdClientId.remove(event.getSession().getId()); + if (clientIdentifier != null) { + clientIdSessionId.remove(clientIdentifier); } } } diff --git a/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java b/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java index edc562f..3a3e883 100644 --- a/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java +++ b/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java @@ -16,12 +16,16 @@ */ package org.apache.catalina.valves; +import java.io.IOException; +import 
java.util.Arrays; import java.util.Collections; +import javax.servlet.ServletException; import javax.servlet.http.HttpSession; import org.junit.Test; - +import org.apache.catalina.Context; +import org.apache.catalina.Host; import org.apache.catalina.Valve; import org.apache.catalina.connector.Request; import org.apache.catalina.connector.Response; @@ -34,6 +38,7 @@ public class TestCrawlerSessionManagerValve { public void testCrawlerIpsPositive() throws Exception { CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); valve.setCrawlerIps("216\\.58\\.206\\.174"); + valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); valve.setNext(EasyMock.createMock(Valve.class)); HttpSession session = createSessionExpectations(valve, true); Request request = createRequestExpectations("216.58.206.174", session, true); @@ -49,6 +54,7 @@ public class TestCrawlerSessionManagerValve { public void testCrawlerIpsNegative() throws Exception { CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); valve.setCrawlerIps("216\\.58\\.206\\.174"); + valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); valve.setNext(EasyMock.createMock(Valve.class)); HttpSession session = createSessionExpectations(valve, false); Request request = createRequestExpectations("127.0.0.1", session, false); @@ -60,6 +66,32 @@ public class TestCrawlerSessionManagerValve { EasyMock.verify(request, session); } + @Test + public void testCrawlerMultipleHostsHostAware() throws Exception { + CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); + valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); + valve.setHostAware(true); + valve.setContextAware(true); + valve.setNext(EasyMock.createMock(Valve.class)); + + verifyCrawlingLocalhost(valve, "localhost"); + verifyCrawlingLocalhost(valve, "example.invalid"); + } + + + private void verifyCrawlingLocalhost(CrawlerSessionManagerValve valve, String hostname) + throws IOException, ServletException { + HttpSession session = 
createSessionExpectations(valve, true); + Request request = createRequestExpectations("127.0.0.1", session, true, hostname, "tomcatBot 1.0"); + + EasyMock.replay(request, session); + + valve.invoke(request, EasyMock.createMock(Response.class)); + + EasyMock.verify(request, session); + } + + private HttpSession createSessionExpectations(CrawlerSessionManagerValve valve, boolean isBot) { HttpSession session = EasyMock.createMock(HttpSession.class); if (isBot) { @@ -72,15 +104,36 @@ public class TestCrawlerSessionManagerValve { return session; } + private Request createRequestExpectations(String ip, HttpSession session, boolean isBot) { + return createRequestExpectations(ip, session, isBot, "localhost", "something 1.0"); + } + + private Request createRequestExpectations(String ip, HttpSession session, boolean isBot, String hostname, String userAgent) { Request request = EasyMock.createMock(Request.class); EasyMock.expect(request.getRemoteAddr()).andReturn(ip); + EasyMock.expect(request.getHost()).andReturn(simpleHostWithName(hostname)); + EasyMock.expect(request.getContext()).andReturn(simpleContextWithName()); IExpectationSetters setter = EasyMock.expect(request.getSession(false)) .andReturn(null); if (isBot) { setter.andReturn(session); } - EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.emptyEnumeration()); + EasyMock.expect(request.getHeaders("user-agent")).andAnswer(() -> Collections.enumeration(Arrays.asList(userAgent))); return request; } + + private Host simpleHostWithName(String hostname) { + Host host = EasyMock.createMock(Host.class); + EasyMock.expect(host.getName()).andReturn(hostname); + EasyMock.replay(host); + return host; + } + + private Context simpleContextWithName() { + Context context = EasyMock.createMock(Context.class); + EasyMock.expect(context.getName()).andReturn("/examples"); + EasyMock.replay(context); + return context; + } } diff --git a/webapps/docs/config/valve.xml b/webapps/docs/config/valve.xml index 
97b0679..bde7d7d 100644 --- a/webapps/docs/config/valve.xml +++ b/webapps/docs/config/valve.xml @@ -1820,6 +1820,13 @@

+ +

Whether the context name is used together with the client IP to + identify the session to re-use, so that crawler sessions are tracked separately for each web application. Can be combined with hostAware. + Default value: true.

+
+

Regular expression (using java.util.regex) that client IP is matched against to determine if a request is from a web crawler. @@ -1833,6 +1840,13 @@ .*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.* is used.

+ +

Whether the name of the configured host is used together with the client IP to + identify the session to re-use, so that crawler sessions are tracked separately for each host. Can be combined with contextAware. + Default value: true.

+
+

The minimum time in seconds that the Crawler Session Manager Valve should keep the mapping of client IP to session ID in memory without any