blob: 645864c103de571048eb28c3a1873791be3aebf8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.catalina.valves;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import javax.servlet.ServletException;
import javax.servlet.http.HttpSession;
import org.apache.catalina.LifecycleException;
import org.apache.catalina.connector.Request;
import org.apache.catalina.connector.Response;
import org.apache.juli.logging.Log;
import org.apache.juli.logging.LogFactory;
/**
* Web crawlers can trigger the creation of many thousands of sessions as they
* crawl a site which may result in significant memory consumption. This Valve
* ensures that crawlers are associated with a single session - just like normal
* users - regardless of whether or not they provide a session token with their
* requests.
*/
public class CrawlerSessionManagerValve extends ValveBase {

    private static final Log log =
        LogFactory.getLog(CrawlerSessionManagerValve.class);

    /**
     * Session bookkeeping for known crawlers, keyed by the client IP address
     * the crawler's requests arrived from. A concurrent map is used because
     * entries are added while requests are processed and removed by
     * {@link #backgroundProcess()}.
     */
    private final Map<String,SessionInfo> uaIpSessionInfo =
        new ConcurrentHashMap<String, SessionInfo>();

    /** Regular expression source used to recognise crawler User-Agent values. */
    private String crawlerUserAgents =
        ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*";

    /**
     * Compiled form of {@link #crawlerUserAgents}. <code>null</code> means
     * that no User-Agent is treated as a crawler.
     */
    private Pattern uaPattern = null;

    /** Inactive interval, in seconds, applied to sessions created by crawlers. */
    private int sessionInactiveInterval = 60;


    /**
     * Specify the regular expression (using {@link Pattern}) that will be used
     * to identify crawlers based in the User-Agent header provided. The default
     * is {@code ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*"}.
     * Passing <code>null</code> or an empty string disables crawler detection.
     *
     * @param crawlerUserAgents The regular expression using {@link Pattern}
     */
    public void setCrawlerUserAgents(String crawlerUserAgents) {
        this.crawlerUserAgents = crawlerUserAgents;
        this.uaPattern = compileUserAgentPattern(crawlerUserAgents);
    }

    /**
     * @see #setCrawlerUserAgents(String)
     * @return  The current regular expression being used to match user agents.
     */
    public String getCrawlerUserAgents() {
        return crawlerUserAgents;
    }


    /**
     * Specify the session timeout (in seconds) for a crawler's session. This is
     * typically lower than that for a user session. The default is 60 seconds.
     *
     * @param sessionInactiveInterval   The new timeout for crawler sessions
     */
    public void setSessionInactiveInterval(int sessionInactiveInterval) {
        this.sessionInactiveInterval = sessionInactiveInterval;
    }

    /**
     * @see #setSessionInactiveInterval(int)
     * @return  The current timeout in seconds
     */
    public int getSessionInactiveInterval() {
        return sessionInactiveInterval;
    }


    /**
     * Compiles the configured User-Agent expression once the valve is
     * initialised. Uses the same rules as
     * {@link #setCrawlerUserAgents(String)} so that a <code>null</code> or
     * empty expression disables detection instead of failing.
     */
    @Override
    protected void initInternal() throws LifecycleException {
        super.initInternal();

        uaPattern = compileUserAgentPattern(crawlerUserAgents);
    }


    /**
     * For requests that carry no valid session, checks whether the single
     * User-Agent header matches the crawler expression. If it does and a
     * session ID is already cached for the client IP, that ID is set as the
     * request's requested session ID before the rest of the pipeline runs.
     * After the pipeline returns, the cache is updated with the session the
     * request ended up with.
     *
     * @param request   The request being processed
     * @param response  The response being generated
     *
     * @throws IOException      Propagated from the next valve
     * @throws ServletException Propagated from the next valve
     */
    @Override
    public void invoke(Request request, Response response) throws IOException,
            ServletException {

        boolean isBot = false;
        SessionInfo sessionInfo = null;
        String clientIp = null;

        if (log.isDebugEnabled()) {
            log.debug(request.hashCode() + ": ClientIp=" +
                    request.getRemoteAddr() + ", RequestedSessionId=" +
                    request.getRequestedSessionId());
        }

        // If the incoming request has a valid session ID, no action is required
        if (request.getSession(false) == null) {

            // Is this a crawler - check the UA headers
            Enumeration<String> uaHeaders = request.getHeaders("user-agent");
            String uaHeader = null;
            if (uaHeaders.hasMoreElements()) {
                uaHeader = uaHeaders.nextElement();
            }

            // Read the field once: the setter may replace it (or set it to
            // null) while this request is being processed.
            Pattern pattern = uaPattern;

            // If more than one UA header - assume not a bot
            if (uaHeader != null && !uaHeaders.hasMoreElements()) {

                if (log.isDebugEnabled()) {
                    log.debug(request.hashCode() + ": UserAgent=" + uaHeader);
                }

                // A null pattern means crawler detection is disabled
                if (pattern != null && pattern.matcher(uaHeader).matches()) {
                    isBot = true;

                    if (log.isDebugEnabled()) {
                        log.debug(request.hashCode() +
                                ": Bot found. UserAgent=" + uaHeader);
                    }
                }
            }

            // If this is a bot, is the session ID known?
            if (isBot) {
                clientIp = request.getRemoteAddr();
                sessionInfo = uaIpSessionInfo.get(clientIp);
                if (sessionInfo != null) {
                    request.setRequestedSessionId(sessionInfo.getSessionId());
                    if (log.isDebugEnabled()) {
                        log.debug(request.hashCode() +
                                ": SessionID=" + sessionInfo.getSessionId());
                    }
                }
            }
        }

        getNext().invoke(request, response);

        if (isBot) {
            // The session (if any) the request is associated with now that
            // the rest of the pipeline has run.
            HttpSession s = request.getSession(false);

            if (sessionInfo == null) {
                // Has bot just created a session, if so make a note of it
                if (s != null) {
                    uaIpSessionInfo.put(clientIp, new SessionInfo(s.getId()));
                    s.setMaxInactiveInterval(sessionInactiveInterval);

                    if (log.isDebugEnabled()) {
                        log.debug(request.hashCode() +
                                ": New bot session. SessionID=" + s.getId());
                    }
                }
            } else if (s != null &&
                    !s.getId().equals(sessionInfo.getSessionId())) {
                // The cached ID no longer identifies the crawler's session
                // (the request ended up with a different one). Cache the
                // current ID; otherwise the stale entry would be refreshed
                // on every request and each request would create yet
                // another session.
                uaIpSessionInfo.put(clientIp, new SessionInfo(s.getId()));
                s.setMaxInactiveInterval(sessionInactiveInterval);

                if (log.isDebugEnabled()) {
                    log.debug(request.hashCode() +
                            ": Bot session replaced. SessionID=" + s.getId());
                }
            } else {
                sessionInfo.access();

                if (log.isDebugEnabled()) {
                    log.debug(request.hashCode() +
                            ": Bot session accessed. SessionID=" +
                            sessionInfo.getSessionId());
                }
            }
        }
    }


    /**
     * Removes cached entries that have not been accessed for more than
     * {@link #getSessionInactiveInterval()} plus 60 seconds.
     */
    @Override
    public void backgroundProcess() {
        super.backgroundProcess();

        // Use long arithmetic so that a large configured interval cannot
        // overflow int and produce a wrong cut-off.
        long expireTime = System.currentTimeMillis() -
                (sessionInactiveInterval + 60L) * 1000L;

        Iterator<Entry<String,SessionInfo>> iter =
            uaIpSessionInfo.entrySet().iterator();

        // Remove any sessions in the cache that have expired.
        while (iter.hasNext()) {
            Entry<String,SessionInfo> entry = iter.next();
            if (entry.getValue().getLastAccessed() < expireTime) {
                iter.remove();
            }
        }
    }


    /**
     * Compiles a User-Agent expression.
     *
     * @param regex The expression, may be <code>null</code> or empty
     *
     * @return The compiled pattern, or <code>null</code> if the expression is
     *         <code>null</code> or empty
     */
    private static Pattern compileUserAgentPattern(String regex) {
        if (regex == null || regex.length() == 0) {
            return null;
        }
        return Pattern.compile(regex);
    }


    /**
     * A cached session ID together with the time it was last used by this
     * valve.
     */
    private static final class SessionInfo {
        private final String sessionId;
        // volatile: written while processing requests, read by
        // backgroundProcess()
        private volatile long lastAccessed;

        public SessionInfo(String sessionId) {
            this.sessionId = sessionId;
            this.lastAccessed = System.currentTimeMillis();
        }

        public String getSessionId() {
            return sessionId;
        }

        public long getLastAccessed() {
            return lastAccessed;
        }

        /** Records the current time as the last access time. */
        public void access() {
            lastAccessed = System.currentTimeMillis();
        }
    }
}