diff --git a/.gitignore b/.gitignore index 8c521aa68e..9cac3379cb 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ lib/spotbugs-* ivy/dependency-check-ant/* .gradle* ivy/apache-rat-* +.vscode diff --git a/conf/log4j2.xml b/conf/log4j2.xml index 9eb807b4fa..4aa7b2dd58 100644 --- a/conf/log4j2.xml +++ b/conf/log4j2.xml @@ -25,7 +25,8 @@ - + + diff --git a/conf/url-authentication.xml.template b/conf/url-authentication.xml.template new file mode 100644 index 0000000000..17c13fb962 --- /dev/null +++ b/conf/url-authentication.xml.template @@ -0,0 +1,22 @@ + + + + + + + + \ No newline at end of file diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 0d3740eb44..f1258daab7 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -158,7 +158,7 @@ private String filterNormalize(String url) { if (filters != null) url = filters.filter(url); // filter the url } catch (Exception e) { - LOG.warn("Skipping " + url + ":" + e); + LOG.warn("Skipping {}", url, e); url = null; } } diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 498259a950..92430dd9f4 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -78,6 +78,7 @@ + @@ -142,6 +143,7 @@ + @@ -226,6 +228,7 @@ + diff --git a/src/plugin/protocol-smb/build.xml b/src/plugin/protocol-smb/build.xml new file mode 100755 index 0000000000..54e6d24059 --- /dev/null +++ b/src/plugin/protocol-smb/build.xml @@ -0,0 +1,22 @@ + + + + + + + diff --git a/src/plugin/protocol-smb/ivy.xml b/src/plugin/protocol-smb/ivy.xml new file mode 100755 index 0000000000..94c0bb3bb4 --- /dev/null +++ b/src/plugin/protocol-smb/ivy.xml @@ -0,0 +1,54 @@ + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/protocol-smb/plugin.xml b/src/plugin/protocol-smb/plugin.xml new file mode 100755 index 0000000000..420ff89d9c --- /dev/null +++ b/src/plugin/protocol-smb/plugin.xml @@ -0,0 
+1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbHandler.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbHandler.java
new file mode 100644
index 0000000000..cb7135f689
--- /dev/null
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbHandler.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.smb;
+
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * Stream handler of the protocol-smb plugin. It only acts as a factory for
+ * {@link SmbURLConnection} objects; no network access happens here.
+ */
+public class SmbHandler extends URLStreamHandler {
+
+  /**
+   * Wraps the given URL into a new {@link SmbURLConnection}.
+   *
+   * @param u the URL a connection object is requested for
+   * @return a fresh, not yet connected {@link SmbURLConnection}
+   */
+  @Override
+  protected URLConnection openConnection(URL u) {
+    final URLConnection smbConnection = new SmbURLConnection(u);
+    return smbConnection;
+  }
+}
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
new file mode 100755
index 0000000000..8e8262d53f
--- /dev/null
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbProtocol.java
@@ -0,0 +1,475 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.smb;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.protocol.smb.URLAuthentication.Authentication;
+import org.xml.sax.InputSource;
+import com.hierynomus.msdtyp.AccessMask;
+import com.hierynomus.mserref.NtStatus;
+import com.hierynomus.msfscc.FileAttributes;
+import com.hierynomus.msfscc.fileinformation.FileAllInformation;
+import com.hierynomus.msfscc.fileinformation.FileIdBothDirectoryInformation;
+import com.hierynomus.mssmb2.SMBApiException;
+import com.hierynomus.mssmb2.SMB2CreateDisposition;
+import com.hierynomus.mssmb2.SMB2CreateOptions;
+import com.hierynomus.mssmb2.SMB2ShareAccess;
+import com.hierynomus.smbj.auth.AuthenticationContext;
+import com.hierynomus.smbj.connection.Connection;
+import com.hierynomus.smbj.session.Session;
+import com.hierynomus.smbj.share.DiskShare;
+import com.hierynomus.smbj.share.File;
+import com.hierynomus.smbj.SMBClient;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link Protocol} implementation that reads files and folders from SMB
+ * shares through the smbj client library.
+ *
+ * Folders are rendered as a small HTML index page whose links point to the
+ * folder entries; files are returned as raw bytes, truncated to
+ * <code>smb.content.limit</code>. Credentials are looked up per URL in the
+ * file named by <code>smb.url-authentication.file</code>.
+ */
+public class SmbProtocol implements Protocol, AutoCloseable {
+  protected static final Logger LOG = LoggerFactory.getLogger(SmbProtocol.class);
+
+  private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray();
+
+  /** Port used when the URL does not carry one. */
+  private static final int DEFAULT_SMB_PORT = 445;
+
+  /**
+   * Number of leading content bytes written to the trace log.
+   * NOTE(review): the original loop bound was lost when this patch was
+   * rendered; 16 is an assumption - confirm against the original commit.
+   */
+  private static final int TRACE_DUMP_BYTES = 16;
+
+  private Configuration conf;
+  private URLAuthentication urlAuthentication;
+
+  private int contentLimit;
+  private final Set<String> ignoreFiles;
+  private Collection<String> agentNames;
+
+  private long scannedFolderCount;
+  private long scannedFileCount;
+  private long truncatedFileCount;
+
+  /** Parsed robots rules, keyed by the robots.txt URL of the share. */
+  private final Map<String, BaseRobotRules> robotsCache = new TreeMap<>();
+
+  public SmbProtocol() {
+    // Place here only files that SMB needs to ignore. Other files such as
+    // version control (.git, .svn) can be ignored via the regex url filter.
+    this.ignoreFiles = new HashSet<>();
+    ignoreFiles.add(".");
+    ignoreFiles.add("..");
+  }
+
+  @Override
+  public Configuration getConf() {
+    LOG.debug("getConf()");
+    return this.conf;
+  }
+
+  /**
+   * Reads the plugin settings: <code>smb.agent.name</code> (mandatory),
+   * <code>smb.url-authentication.file</code> and
+   * <code>smb.content.limit</code>.
+   *
+   * @param conf the configuration to read from
+   * @throws IllegalArgumentException if <code>smb.agent.name</code> is not
+   *           set or empty
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    agentNames = conf.getTrimmedStringCollection("smb.agent.name");
+    if (agentNames == null || agentNames.isEmpty()) {
+      throw new IllegalArgumentException("Config parameter 'smb.agent.name' not set or empty.");
+    }
+
+    // load authentication data; the stream is closed once it has been parsed
+    String filename = conf.get("smb.url-authentication.file", "url-authentication.xml");
+    try (InputStream authStream = conf.getConfResourceAsInputStream(filename)) {
+      if (authStream == null) {
+        LOG.warn("Authentication file {} could not be found", filename);
+      }
+      urlAuthentication = URLAuthentication.loadAuthentication(new InputSource(authStream));
+    } catch (IOException e) {
+      // only close() can end up here; the data has been loaded already
+      LOG.warn("Could not close authentication file {}", filename, e);
+    }
+
+    contentLimit = conf.getInt("smb.content.limit", Integer.MAX_VALUE - 100);
+    LOG.info("Understood smb.content.limit={}", contentLimit);
+  }
+
+  /**
+   * Escapes the characters that have a special meaning in HTML text and
+   * attribute values.
+   */
+  private static String escapeHtml(String text) {
+    StringBuilder out = new StringBuilder(text.length() + 16);
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+      case '&':
+        out.append("&amp;");
+        break;
+      case '<':
+        out.append("&lt;");
+        break;
+      case '>':
+        out.append("&gt;");
+        break;
+      case '"':
+        out.append("&quot;");
+        break;
+      case '\'':
+        out.append("&#39;");
+        break;
+      default:
+        out.append(c);
+      }
+    }
+    return out.toString();
+  }
+
+  /**
+   * Lists a directory as an HTML page with one link per entry.
+   *
+   * NOTE(review): the HTML markup of the original method was stripped when
+   * this patch was rendered. The tags below are a reconstruction - confirm
+   * against the original commit.
+   *
+   * @param share the connected share
+   * @param shareName name of the share, used for the page title only
+   * @param path path of the folder inside the share
+   * @return some HTML string
+   */
+  private String getDirectoryContent(DiskShare share, String shareName, String path) throws UnsupportedEncodingException {
+    String title = escapeHtml("Index of /" + shareName + path);
+
+    StringBuilder sb = new StringBuilder();
+    sb.append("<html><head>");
+    sb.append("<title>").append(title).append("</title>");
+    sb.append("</head><body>");
+    sb.append("<h1>").append(title).append("</h1>");
+    sb.append("<pre>\n");
+    for (FileIdBothDirectoryInformation f : share.list(path)) {
+      String name = f.getFileName();
+      if (ignoreFiles.contains(name)) {
+        LOG.debug("File skipped: {}", name);
+        continue;
+      }
+      // do not produce "//name" for entries of the share root
+      String childPath = path.endsWith("/") ? path + name : path + "/" + name;
+      String suffix = share.folderExists(childPath) ? "/" : "";
+
+      // The fetcher URL-decodes paths again, so encode the link target;
+      // %20 instead of '+' keeps the link valid as a URL path.
+      String href = URLEncoder.encode(name, StandardCharsets.UTF_8.name()).replace("+", "%20") + suffix;
+      sb.append("<a href='").append(href).append("'>");
+      sb.append(escapeHtml(name)).append(suffix);
+      sb.append("</a>\t").append(f.getLastWriteTime()).append("\n");
+    }
+    sb.append("</pre>");
+    sb.append("</body></html>");
+
+    return sb.toString();
+  }
+
+  /**
+   * Opens a connection to the host and port named in the URL; port 445 is
+   * used when the URL has none. The caller has to close the connection.
+   */
+  private Connection getSMBConnection(URL url) throws UnsupportedEncodingException, IOException {
+    String hostname = url.getHost();
+    int port = url.getPort() == -1 ? DEFAULT_SMB_PORT : url.getPort();
+
+    LOG.trace("hostname={}", hostname);
+    LOG.trace("port={}", port);
+
+    // todo: we construct and destruct the connection for each and every URL. Can connection pools improve?
+    SMBClient client = new SMBClient();
+    return client.connect(hostname, port);
+  }
+
+  /**
+   * Builds the URL of the robots.txt file in the root of the share that the
+   * given URL points into.
+   */
+  private URL getRobotsUrl(URL url) throws URISyntaxException, MalformedURLException, UnsupportedEncodingException {
+    // The multi-argument URI constructor quotes '%' itself, so it must be
+    // handed the decoded share name; otherwise "%20" would become "%2520".
+    String shareName = getSmbShareAndPath(url)[0];
+    return new URI(url.getProtocol(), url.getUserInfo(), url.getHost(), url.getPort(), "/" + shareName + "/robots.txt", null, null).toURL();
+  }
+
+  /**
+   * Authenticates on the given connection and connects to the share named in
+   * the URL. Credentials come from the authentication file; if no entry
+   * matches the URL an anonymous session is used. The caller has to close
+   * the returned share.
+   *
+   * @throws IllegalStateException if no authentication data was loaded
+   */
+  private DiskShare getDiskShare(URL url, Connection connection) throws UnsupportedEncodingException, IOException {
+    if (urlAuthentication == null) {
+      throw new IllegalStateException("urlAuthentication must not be null");
+    }
+
+    String[] components = getSmbShareAndPath(url);
+    String shareName = components[0];
+
+    LOG.trace("shareAndPath={}", url.getPath());
+    LOG.trace("share={}", shareName);
+    LOG.trace("path={}", components[1]);
+
+    Authentication auth = urlAuthentication.getAuthenticationFor(url.toString());
+    AuthenticationContext context;
+    if (auth == null) {
+      LOG.trace("Anonymously connecting to {}", url);
+      context = AuthenticationContext.anonymous();
+    } else {
+      LOG.trace("Authenticating with {}", auth);
+      context = new AuthenticationContext(auth.getUser(), auth.getPassword(), auth.getDomain());
+    }
+    Session session = connection.authenticate(context);
+
+    // Connect to Share
+    return (DiskShare) session.connectShare(shareName);
+  }
+
+  /**
+   * Splits an absolute path into share and path.
+   * The share is the top level directory, everything else will become the path.
+   * Since the whole structure can be transported via URLs, URL-decoding is also
+   * applied.
+   *
+   * @param url the url to parse
+   * @return an array consisting of [share, path]
+   * @throws IllegalArgumentException if the URL path does not name a share
+   */
+  private String[] getSmbShareAndPath(URL url) throws UnsupportedEncodingException {
+    String[] components = url.getPath().split("/", 3);
+    if (components.length < 2 || components[1].isEmpty()) {
+      // e.g. smb://host or smb://host/
+      throw new IllegalArgumentException("URL does not name an SMB share: " + url);
+    }
+    String shareName = URLDecoder.decode(components[1], StandardCharsets.UTF_8.name());
+    String path = components.length > 2 ? "/" + components[2] : "/";
+    path = URLDecoder.decode(path, StandardCharsets.UTF_8.name());
+
+    return new String[] {shareName, path};
+  }
+
+  /**
+   * Reads a file from the share, at most <code>smb.content.limit</code>
+   * bytes of it.
+   *
+   * @param urlstr URL stored in the returned content
+   * @param base base URL stored in the returned content
+   * @param share the connected share
+   * @param path path of the file inside the share
+   * @param metadata receives "fileSize" and, if the file was cut, "truncated"
+   * @return the content, typed as application/octet-stream
+   */
+  private Content getFileContent(String urlstr, String base, DiskShare share, String path, Metadata metadata) throws IOException {
+    FileAllInformation fileInfo = share.getFileInformation(path);
+    long fileSize = fileInfo.getStandardInformation().getEndOfFile();
+    long fetchSize = fileSize;
+    metadata.add("fileSize", String.valueOf(fileSize));
+
+    // todo: we run into issues if the file is bigger than 2 GB. I made the limit configurable
+    // but e.g. zip can no longer be evaluated if too big.
+    if (fetchSize > contentLimit) {
+      LOG.info("trunkating {}", urlstr);
+      fetchSize = contentLimit;
+
+      // todo: this metadata seems to be not available for the indexer. However it might be useful to know the content
+      // discovery is incomplete
+      metadata.add("truncated", String.valueOf(fetchSize));
+      truncatedFileCount++;
+    }
+
+    byte[] bytes;
+    // close the remote file handle and its stream as soon as the bytes are read
+    try (File file = share.openFile(path, EnumSet.of(AccessMask.GENERIC_READ), null, SMB2ShareAccess.ALL, SMB2CreateDisposition.FILE_OPEN, null);
+        InputStream fileIn = file.getInputStream()) {
+      bytes = IOUtils.toByteArray(fileIn, fetchSize); // read inputstream into byte array
+    }
+
+    LOG.trace("retrieved {} bytes", bytes.length);
+
+    if (LOG.isTraceEnabled()) {
+      StringBuilder sb = new StringBuilder();
+      int dumped = Math.min(bytes.length, TRACE_DUMP_BYTES);
+      for (int i = 0; i < dumped; i++) {
+        int b = bytes[i] & 0xFF;
+        sb.append(HEX_ARRAY[b >>> 4]).append(HEX_ARRAY[b & 0xF]);
+      }
+      LOG.trace("retrieved {} bytes starting with {}", bytes.length, sb.toString());
+    }
+    LOG.trace("metadata={}", metadata);
+
+    return new Content(urlstr, base, bytes, "application/octet-stream", metadata, getConf());
+  }
+
+  /**
+   * Derives the base URL for a directory listing: the URL itself, made to
+   * end with "/." so that relative links resolve inside the directory.
+   */
+  private String getBase(Text urlstr) {
+    // construct a suitable base
+    String base = urlstr.toString();
+    if (base.endsWith("/")) {
+      base = base + ".";
+    }
+    if (!base.endsWith("/.")) {
+      base = base + "/.";
+    }
+
+    LOG.trace("base={}", base);
+    return base;
+  }
+
+  /**
+   * Get the {@link ProtocolOutput} for a given url and crawldatum.
+   *
+   * Folders yield an HTML index, files their (possibly truncated) bytes.
+   * A missing file or an unknown share is reported as
+   * {@link ProtocolStatus#STATUS_NOTFOUND}; any other failure is wrapped into
+   * the returned status.
+   *
+   * @param urlstr canonical url
+   * @param datum associated {@link org.apache.nutch.crawl.CrawlDatum}
+   * @return the {@link ProtocolOutput}
+   * @see <a href="https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/CrawlDatum.java">CrawlDatum</a>
+   */
+  @Override
+  public ProtocolOutput getProtocolOutput(Text urlstr, CrawlDatum datum) {
+    LOG.debug("getProtocolOutput({}, {})", urlstr, datum);
+
+    try {
+      URL url = new URI(urlstr.toString()).toURL();
+      String[] components = getSmbShareAndPath(url);
+      String shareName = components[0];
+      String path = components[1];
+      String base = getBase(urlstr);
+
+      try (Connection connection = getSMBConnection(url)) {
+        try (DiskShare share = getDiskShare(url, connection)) {
+
+          // now get the content
+          if (share.folderExists(path)) {
+            String htmlContent = getDirectoryContent(share, shareName, path);
+            LOG.trace("directory={}", htmlContent);
+            scannedFolderCount++;
+
+            return new ProtocolOutput(
+              new Content(urlstr.toString(), base, htmlContent.getBytes(StandardCharsets.UTF_8), "text/html", new Metadata(), getConf()),
+              ProtocolStatus.STATUS_SUCCESS
+            );
+          }
+
+          if (share.fileExists(path)) {
+            // todo: how can we store this, and maybe more metadata?
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
+
+            Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, path, metadata);
+            scannedFileCount++;
+
+            // create content and return result
+            return new ProtocolOutput(content, ProtocolStatus.STATUS_SUCCESS);
+          }
+
+          // communicate error
+          String message = "File not found: " + urlstr;
+          LOG.info(message);
+          return new ProtocolOutput(
+            new Content(urlstr.toString(), base, message.getBytes(StandardCharsets.UTF_8), "text/plain", new Metadata(), getConf()),
+            ProtocolStatus.STATUS_NOTFOUND
+          );
+        } catch (SMBApiException e) {
+          if (e.getStatus() != NtStatus.STATUS_BAD_NETWORK_NAME) {
+            throw e;
+          }
+
+          // this URL makes no sense to be scanned. Make sure this URL gets evicted from the CrawlDB.
+          LOG.error("Bad network name: {}", urlstr);
+          return new ProtocolOutput(
+            new Content(urlstr.toString(), base, String.valueOf(e.getMessage()).getBytes(StandardCharsets.UTF_8), "text/plain", new Metadata(), getConf()),
+            ProtocolStatus.STATUS_NOTFOUND
+          );
+        }
+      }
+
+    } catch (Exception e) {
+      LOG.error("Could not get protocol output for {}", urlstr, e);
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /**
+   * Retrieve robot rules applicable for this URL.
+   *
+   * The rules are read from <code>/robots.txt</code> in the root of the
+   * share and cached per share. A share without that file, or an unknown
+   * share, gets empty rules; any other failure defers the visit.
+   *
+   * @param urlstr
+   *          URL to check
+   * @param datum
+   *          page datum
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. This implementation only logs
+   *          it and never adds to it.
+   * @return robot rules (specific for this URL or default), never null
+   */
+  @Override
+  public BaseRobotRules getRobotRules(Text urlstr, CrawlDatum datum, List<Content> robotsTxtContent) {
+    LOG.trace("getRobotRules({}, {}, {})", urlstr, datum, robotsTxtContent);
+
+    URL robotsURL = null;
+    try {
+      // calculate new URL
+      URL url = new URI(urlstr.toString()).toURL();
+      robotsURL = getRobotsUrl(url);
+      String cacheKey = robotsURL.toString();
+      LOG.debug("Robots URL = {}", robotsURL);
+
+      // if we are running multithreaded, make only one thread at a time check
+      // the cache. It means if we miss, only one thread will go and fetch/parse
+      // robots.txt while other threads will wait
+      synchronized (robotsCache) {
+        BaseRobotRules cached = robotsCache.get(cacheKey);
+        if (cached != null) {
+          LOG.debug("Found {} in cache", robotsURL);
+          return cached;
+        }
+
+        try (Connection connection = getSMBConnection(url)) {
+          try (DiskShare share = getDiskShare(url, connection)) {
+            // search for the file compliant to https://www.rfc-editor.org/rfc/rfc9309.html
+            // chapter 2.3
+            if (!share.fileExists("/robots.txt")) {
+              // no robots file? Then we can scan everything
+              LOG.info("No robots.txt found for {} -> crawl everything", robotsURL);
+              BaseRobotRules rules = RobotRulesParser.EMPTY_RULES;
+              robotsCache.put(cacheKey, rules); // cache the value - we will need it more often
+              return rules;
+            }
+
+            Metadata metadata = new Metadata();
+            Content content = getFileContent(urlstr.toString(), url.toURI().resolve("..").toString(), share, "/robots.txt", metadata);
+
+            // make use of
+            // https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/robots/SimpleRobotRulesParser.html#parseContent(java.lang.String,byte%5B%5D,java.lang.String,java.util.Collection)
+            SimpleRobotRulesParser simpleRobotsRulesParser = new SimpleRobotRulesParser();
+            SimpleRobotRules rules = simpleRobotsRulesParser.parseContent(urlstr.toString(), content.getContent(), content.getContentType(), agentNames);
+            robotsCache.put(cacheKey, rules); // cache the value - we will need it more often
+            LOG.info("found and parsed {}", robotsURL);
+            return rules;
+          } catch (SMBApiException e) {
+            if (e.getStatus() != NtStatus.STATUS_BAD_NETWORK_NAME) {
+              throw e;
+            }
+
+            // this URL makes no sense to be scanned. But we assume 'empty rules' as no robots.txt exists and
+            // in getProtocolOutput we can make sure this URL gets evicted from the CrawlDB.
+            LOG.error("Bad network name: {} -> crawl everything", urlstr);
+            BaseRobotRules rules = RobotRulesParser.EMPTY_RULES;
+            robotsCache.put(cacheKey, rules); // cache the value - we will need it more often
+            return rules;
+          } // DiskShare
+        } // Connection
+      } // synchronized
+
+    } catch (Exception e) {
+      LOG.info("Could not get robot rules for {} (initially {})", robotsURL, urlstr, e);
+      return RobotRulesParser.DEFER_VISIT_RULES;
+    }
+  }
+
+  /**
+   * Closes this resource, relinquishing any underlying resources.
+   *
+   * Some statistics is printed.
+   */
+  @Override
+  public void close() {
+    LOG.info("Closing plugin");
+    LOG.info("Scanned folders: {}", scannedFolderCount);
+    LOG.info("Scanned files {}", scannedFileCount);
+    LOG.info("Truncated files {}", truncatedFileCount);
+  }
+
+  /**
+   * As Nutch does not close protocols let's do that before GC.
+   */
+  @Override
+  public void finalize() {
+    close();
+  }
+}
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java
new file mode 100644
index 0000000000..22170478e0
--- /dev/null
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SmbURLConnection.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.smb;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * {@link URLConnection} that only takes an SMB URL apart into schema, host,
+ * port, share and path. {@link #connect()} does nothing.
+ *
+ * The first segment of the URL path is the share, the rest is the path inside
+ * the share. Both are URL-decoded as UTF-8.
+ */
+public class SmbURLConnection extends URLConnection {
+
+  /** Port reported when the URL does not carry one. */
+  private static final int DEFAULT_SMB_PORT = 445;
+
+  private final String schema;
+  private final String host;
+  private final int port;
+  private final String share;
+  private final String path;
+
+  /**
+   * Parses the given URL.
+   *
+   * @param url the URL, e.g. <code>smb://host:445/share/dir/file.txt</code>
+   * @throws IllegalArgumentException if share or path cannot be URL-decoded
+   */
+  public SmbURLConnection(URL url) {
+    super(url);
+
+    // Let java.net.URL separate user info, host and port. Splitting the
+    // string by hand took an explicit port for the share name.
+    schema = url.getProtocol();
+    host = url.getHost();
+    port = url.getPort() == -1 ? DEFAULT_SMB_PORT : url.getPort();
+
+    // "/share/dir/file" splits into ["", "share", "dir/file"]
+    String[] parts = url.getPath().split("/", 3);
+    String rawShare = parts.length > 1 ? parts[1] : "";
+    String rawPath = parts.length > 2 ? parts[2] : "";
+
+    String decodedShare;
+    String decodedPath;
+    try {
+      // decode after splitting, so that an encoded '/' stays part of a name
+      decodedShare = URLDecoder.decode(rawShare, StandardCharsets.UTF_8.name());
+      decodedPath = URLDecoder.decode(rawPath, StandardCharsets.UTF_8.name());
+    } catch (UnsupportedEncodingException e) {
+      throw new IllegalArgumentException("could not decypher given url", e);
+    }
+    share = decodedShare;
+    path = "/" + decodedPath;
+  }
+
+  /** @return the URL scheme, e.g. "smb" */
+  public String getSchema() {
+    return schema;
+  }
+
+  /** @return the host name of the URL */
+  public String getHost() {
+    return host;
+  }
+
+  /** @return the port of the URL, or 445 if the URL has none */
+  public int getPort() {
+    return port;
+  }
+
+  /** @return the decoded share name; empty if the URL has no path */
+  public String getShare() {
+    return share;
+  }
+
+  /** @return the decoded path inside the share, always starting with "/" */
+  public String getPath() {
+    return path;
+  }
+
+  /** Does nothing; this class only parses the URL. */
+  @Override
+  public void connect() {
+
+  }
+}
\ No newline at end of file
diff --git a/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
new file mode 100644
index 0000000000..d623b28ee1
--- /dev/null
+++ b/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/URLAuthentication.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.smb;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+import javax.xml.XMLConstants;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+/**
+ * Ordered list of credentials, each bound to a regular expression. The first
+ * entry whose pattern matches a URL completely supplies the credentials for
+ * that URL.
+ */
+public class URLAuthentication {
+  protected static final Logger LOG = LoggerFactory.getLogger(URLAuthentication.class);
+
+  /** One set of credentials together with the URL pattern it applies to. */
+  public static class Authentication {
+    protected static final Logger LOG = LoggerFactory.getLogger(Authentication.class);
+
+    private final Pattern pattern;
+    private final String user;
+    private final String domain;
+    private final char[] password;
+
+    /**
+     * @param pattern regular expression a URL has to match completely
+     * @param user user name
+     * @param domain domain of the user; not validated
+     * @param password password; never logged
+     * @throws IllegalArgumentException if pattern or user is null or empty,
+     *           if password is null, or if pattern is not a valid regular
+     *           expression
+     */
+    protected Authentication(String pattern, String user, String domain, char[] password) {
+      LOG.debug("Authentication({}, {}, {}, *****)", pattern, user, domain);
+      if (pattern == null || pattern.isEmpty()) {
+        throw new IllegalArgumentException("pattern must not be null");
+      }
+      if (user == null || user.isEmpty()) {
+        throw new IllegalArgumentException("user must not be null");
+      }
+      if (password == null) {
+        throw new IllegalArgumentException("password must not be null");
+      }
+      this.pattern = Pattern.compile(pattern);
+      this.user = user;
+      this.domain = domain;
+      this.password = password;
+    }
+
+    /** @return true if the whole URL matches the pattern of this entry */
+    public boolean matches(String url) {
+      LOG.debug("matches({})", url);
+      return pattern.matcher(url).matches();
+    }
+
+    protected Pattern getPattern() {
+      return pattern;
+    }
+
+    public String getUser() {
+      return user;
+    }
+
+    public char[] getPassword() {
+      return password;
+    }
+
+    public String getDomain() {
+      return domain;
+    }
+  }
+
+  private final List<Authentication> authentications;
+
+  /**
+   * Reads all <code>authentication</code> elements below the document root.
+   *
+   * NOTE(review): the loop body, the error handling and the constructor of
+   * this class were lost when the patch was rendered (everything between
+   * '&lt;' and '&gt;' is missing). The attribute names "pattern", "user",
+   * "domain" and "password" and the handling of failures are assumptions -
+   * confirm against conf/url-authentication.xml.template and the original
+   * commit.
+   *
+   * @param inputSource the XML document to read
+   * @return the loaded entries; empty if the document cannot be parsed
+   */
+  public static URLAuthentication loadAuthentication(InputSource inputSource) {
+    LOG.debug("loadAuthentication(...)");
+
+    URLAuthentication result = new URLAuthentication();
+
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      // the file only holds plain elements; keep the parser locked down
+      factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(inputSource);
+      Element rootElement = document.getDocumentElement();
+      NodeList authList = rootElement.getElementsByTagName("authentication");
+      for (int i = 0; i < authList.getLength(); i++) {
+        Element authElement = (Element) authList.item(i);
+        try {
+          result.addAuthentication(new Authentication(
+              authElement.getAttribute("pattern"),
+              authElement.getAttribute("user"),
+              authElement.getAttribute("domain"),
+              authElement.getAttribute("password").toCharArray()));
+        } catch (IllegalArgumentException e) {
+          // one broken entry must not discard the remaining ones
+          LOG.warn("Skipping invalid authentication entry #{}", i, e);
+        }
+      }
+    } catch (Exception e) {
+      LOG.error("Could not load authentication data", e);
+    }
+
+    return result;
+  }
+
+  // NOTE(review): the original visibility of this constructor is unknown;
+  // public is the choice that cannot break existing callers.
+  public URLAuthentication() {
+    authentications = new ArrayList<>();
+  }
+
+  private void addAuthentication(Authentication auth) {
+    LOG.debug("addAuthentication({})", auth);
+    authentications.add(auth);
+  }
+
+  /**
+   * Finds the first entry whose pattern matches the given URL.
+   *
+   * @param url the URL credentials are needed for
+   * @return the matching entry, or null if no pattern matches
+   */
+  public Authentication getAuthenticationFor(String url) {
+    LOG.debug("getAuthenticationFor({})", url);
+
+    for (Authentication auth : authentications) {
+      if (auth.matches(url)) {
+        LOG.trace("matched pattern {}", auth.getPattern());
+        return auth;
+      }
+      LOG.trace("missed pattern {}", auth.getPattern());
+    }
+
+    LOG.trace("Nothing found in {} entries", authentications.size());
+    return null;
+  }
+
+}
diff --git a/src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java b/src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java
new file mode 100644
index 0000000000..7243148251
--- /dev/null
+++ b/src/plugin/protocol-smb/src/test/org/apache/nutch/protocol/smb/TestSmbProtocol.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.smb;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tests for {@link SmbProtocol}.
+ */
+public class TestSmbProtocol {
+  protected static final Logger LOG = LoggerFactory.getLogger(TestSmbProtocol.class);
+
+  private Configuration conf;
+
+  /** Creates a configuration that carries no smb.* settings. */
+  @Before
+  public void setUp() {
+    LOG.warn("setUp()");
+    // must not fail here, otherwise no test of this class can ever pass
+    conf = new Configuration();
+  }
+
+  /** setConf has to reject a configuration without 'smb.agent.name'. */
+  @Test(expected = IllegalArgumentException.class)
+  public void testSetConfRequiresAgentName() {
+    LOG.warn("testSetConfRequiresAgentName()");
+    new SmbProtocol().setConf(conf);
+  }
+
+  @Ignore("placeholder, not implemented yet")
+  @Test
+  public void testSetContentType1() {
+    LOG.warn("testSetContentType1()");
+    Assert.fail();
+  }
+
+  @Ignore("placeholder, not implemented yet")
+  @Test
+  public void testSetContentType2() {
+    LOG.warn("testSetContentType2()");
+    Assert.fail();
+  }
+
+}