|
|
|
package org.apache.nutch.protocol.httpclient; |
|
|
|
import java.net.URL; |
|
import java.text.ParseException; |
|
import java.text.SimpleDateFormat; |
|
import java.util.ArrayList; |
|
import java.util.Date; |
|
import java.util.List; |
|
|
|
import org.apache.hadoop.io.Text; |
|
import org.apache.hadoop.io.Writable; |
|
import org.apache.http.cookie.Cookie; |
|
import org.apache.http.impl.cookie.BasicClientCookie; |
|
import org.apache.nutch.crawl.CrawlDatum; |
|
|
|
/** |
|
* Returns a String representing the cookies to be sent to the protocol request. |
|
* |
|
* @see http://en.wikipedia.org/wiki/HTTP_cookie |
|
**/ |
|
|
|
public class CookieConverter { |
|
|
|
public final static Text useCookiesKey = new Text("nutch.use.cookies"); |
|
public final static Text cookiesKey = new Text("nutch.cookies"); |
|
|
|
public static SimpleDateFormat dateFormat = new SimpleDateFormat( |
|
"EEE, dd MMM yyyy HH:mm:ss zzz"); |
|
|
|
public static List<Cookie> getCookies(CrawlDatum datum, URL targetURL) { |
|
|
|
ArrayList<Cookie> list = new ArrayList<Cookie>(); |
|
|
|
Writable cookies = datum.getMetaData().get(cookiesKey); |
|
if (cookies == null || cookies.toString().length() == 0) |
|
return list; |
|
|
|
String[] cookiestrings = cookies.toString().split("\t"); |
|
for (String cs : cookiestrings) { |
|
String name = null; |
|
String value = null; |
|
|
|
String expires = null; |
|
String domain = null; |
|
String path = null; |
|
|
|
boolean secure = false; |
|
|
|
String[] tokens = cs.split(";"); |
|
|
|
int equals = tokens[0].indexOf("="); |
|
name = tokens[0].substring(0, equals); |
|
value = tokens[0].substring(equals + 1); |
|
|
|
for (int i = 1; i < tokens.length; i++) { |
|
String ti = tokens[i].trim(); |
|
if (ti.equalsIgnoreCase("secure")) |
|
secure = true; |
|
if (ti.toLowerCase().startsWith("path=")) { |
|
path = ti.substring(5); |
|
} |
|
if (ti.toLowerCase().startsWith("domain=")) { |
|
domain = ti.substring(7); |
|
} |
|
if (ti.toLowerCase().startsWith("expires=")) { |
|
expires = ti.substring(8); |
|
} |
|
} |
|
|
|
BasicClientCookie cookie = new BasicClientCookie(name, value); |
|
|
|
// check domain |
|
if (domain != null) { |
|
cookie.setDomain(domain); |
|
|
|
if (!targetURL.getHost().contains(domain)) |
|
continue; |
|
} |
|
|
|
// check path |
|
if (path != null) { |
|
cookie.setPath(path); |
|
|
|
if (!targetURL.getPath().startsWith(path)) |
|
continue; |
|
} |
|
|
|
// check secure |
|
if (secure) { |
|
cookie.setSecure(secure); |
|
|
|
if (!targetURL.getProtocol().equalsIgnoreCase("https")) |
|
continue; |
|
} |
|
|
|
// check expiration |
|
if (expires != null) { |
|
try { |
|
Date expirationDate = dateFormat.parse(expires); |
|
|
|
// check that it hasn't expired? |
|
if (cookie.isExpired(new Date())) |
|
continue; |
|
|
|
cookie.setExpiryDate(expirationDate); |
|
} catch (ParseException e) { |
|
// ignore exceptions |
|
} |
|
} |
|
|
|
// attach additional infos to cookie |
|
list.add(cookie); |
|
} |
|
|
|
return list; |
|
} |
|
} |