|
package com.digitalpebble.crawl; |
|
|
|
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.protocol.selenium.NavigationFilter;
|
|
|
public class JobBoardNavigationFilter extends NavigationFilter { |
|
|
|
@Override |
|
public ProtocolResponse filter(RemoteWebDriver driver, Metadata metadata) { |
|
StringBuilder dummyContent = new StringBuilder("<html>"); |
|
|
|
// check that we are on the right sort of page |
|
if (!driver.getCurrentUrl().contains("JobBoard/ListJobs.aspx")) { |
|
return null; |
|
} |
|
|
|
// iterate on the result pages |
|
while (true) { |
|
// get the links for the current page |
|
List<WebElement> anchors = driver.findElementsByTagName("A"); |
|
for (WebElement element : anchors) { |
|
String href = element.getAttribute("href"); |
|
if (!href.contains("JobDetails.aspx")) |
|
continue; |
|
// generate an outlink |
|
dummyContent.append("<a href=\"").append(href).append("\">"); |
|
dummyContent.append(element.getText()).append("<a>\n"); |
|
} |
|
// see if there is a clickable 'next page' button |
|
WebDriverWait wait = new WebDriverWait(driver, 10); |
|
WebElement nextButton = wait.until(ExpectedConditions |
|
.presenceOfElementLocated(By.id("__Next"))); |
|
if (!nextButton.isEnabled()) |
|
break; |
|
nextButton.click(); |
|
} |
|
|
|
dummyContent.append("</html>"); |
|
return new ProtocolResponse(dummyContent.toString().getBytes(), 200, |
|
metadata); |
|
} |
|
|
|
} |