Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NUTCH-2856] Implement a protocol-smb plugin based on hierynomus/smbj #826

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
ivy/apache-rat-*
.vscode
crawl
lewismc marked this conversation as resolved.
Show resolved Hide resolved
urls
solr_datadir
3 changes: 2 additions & 1 deletion conf/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
<Appenders>
<RollingFile name="RollingFile" fileName="${hadoop.log.dir}/${hadoop.log.file}"
filePattern="${hadoop.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />
<!--<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />-->
<PatternLayout pattern="%d %p %c [%t] %m%n" />
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<CronTriggeringPolicy schedule="0 0 0 * * ?" evaluateOnStartup="true" />
<DefaultRolloverStrategy>
<Delete basePath="${hadoop.log.dir}" maxDepth="2">
Expand Down
57 changes: 57 additions & 0 deletions runNutch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Development helper: clears crawl/ and the Solr core "nutch", injects the seed
# URLs, then repeats generate/fetch/parse/updatedb/index until no segment is found.
lewismc marked this conversation as resolved.
Show resolved Hide resolved

# Abort when JAVA_HOME is not set.
if [ -z "$JAVA_HOME" ]
then
    echo "ERROR: JAVA_HOME is not set." >&2
    exit 1
fi

# 'banner' is an external utility that is not installed on every system.
# Fall back to a plain echo so the stage headings are still printed.
if ! command -v banner >/dev/null 2>&1
then
    banner() { echo "=== $* ==="; }
fi

# --- Reset: wipe the local crawl data and delete the Solr core ---------------
echo "Will remove existing CrawlDb..."
sleep 5
echo "Removing existing CrawlDb..."
banner "Delete DB"
rm -rf crawl/* || exit 1
docker exec -it solr_nutch solr delete -c nutch || exit 1

# --- Seed the CrawlDb from the urls directory --------------------------------
banner "Inject URLs"
./runtime/local/bin/nutch inject crawl/crawldb urls

# --- Recreate the Solr core "nutch" ------------------------------------------
banner "Create Solr"
cp src/plugin/indexer-solr/schema.xml solr_datadir
# Copy the stock config files from Solr's _default configset into the core directory.
docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/solrconfig.xml /var/solr/data/nutch
docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/stopwords.txt /var/solr/data/nutch
docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/protwords.txt /var/solr/data/nutch
docker exec -it solr_nutch cp /opt/solr-9.7.0/server/solr/configsets/_default/conf/synonyms.txt /var/solr/data/nutch
docker exec -it solr_nutch solr create_core -c nutch -d /var/solr/data/nutch || exit 1

# --- Crawl loop: one segment per iteration -----------------------------------
# Leaves with exit code 5 when generate fails or no segment directory exists.
while true
do
    sleep 5
    banner Generate Segment
    ./runtime/local/bin/nutch generate crawl/crawldb crawl/segments/
    # Capture the status immediately: $? is overwritten by every later command
    # (echo, sleep), so testing it further down would not reflect generate.
    generate_status=$?

    # The newest segment is the last entry in the listing.
    segment=$(ls crawl/segments/ | tail -1)
    echo "Found segment $segment"
    sleep 5

    if [ "$generate_status" -eq 0 ] && [ -n "$segment" ]
    then
        banner "Fetch"
        if ./runtime/local/bin/nutch fetch "crawl/segments/$segment"
        then
            sleep 5
            banner "Parse"
            ./runtime/local/bin/nutch parse "crawl/segments/$segment"
            sleep 5
            banner UpdateDB
            ./runtime/local/bin/nutch updatedb crawl/crawldb "crawl/segments/$segment"
            sleep 5
            banner Index
            ./runtime/local/bin/nutch index crawl/crawldb "crawl/segments/$segment"
            sleep 10
            # The segment has been processed; remove it so the next iteration
            # picks up the freshly generated one. $segment is non-empty here.
            rm -rf "crawl/segments/$segment"
        fi
    else
        exit 5
    fi
done
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/Injector.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ private String filterNormalize(String url) {
if (filters != null)
url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
LOG.warn("Skipping {}", url, e);
url = null;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
<ant dir="protocol-httpclient" target="deploy"/>
<ant dir="protocol-interactiveselenium" target="deploy" />
<ant dir="protocol-okhttp" target="deploy"/>
<ant dir="protocol-smb" target="deploy"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<ant dir="protocol-selenium" target="deploy" />
<ant dir="publish-rabbitmq" target="deploy"/>
<ant dir="scoring-depth" target="deploy"/>
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/protocol-smb/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="protocol-smb" default="jar-core">

<!-- This build file defines no targets of its own: it only names the project
     and pulls in the shared plugin build file one directory up. The default
     target jar-core is presumably defined there (not visible from this file). -->
<import file="../build-plugin.xml"/>

</project>
47 changes: 47 additions & 0 deletions src/plugin/protocol-smb/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<ivy-module xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd"
xmlns:ns0="http://ant.apache.org/ivy/maven" version="2.0">
<!-- Module identity: the module name is taken from the Ant property
     ant.project.name rather than being hard-coded here. -->
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<!-- Configurations are not declared locally; they are included from the
     ivy-configurations.xml file three directories up. -->
<configurations>
<include file="../../..//ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!-- Publish a single artifact, named after this module, in the "master" configuration. -->
<artifact conf="master"/>
</publications>

<!-- Libraries resolved for this plugin: smbj itself, plus mbassador, bcprov,
     asn-one and commons-io (presumably needed by smbj at runtime; confirm).
     NOTE(review): the revisions here should match the versioned jar names
     listed in the plugin's plugin.xml runtime section. -->
<dependencies>
<dependency org="com.hierynomus" name="smbj" rev="0.13.0"/>
<dependency org="net.engio" name="mbassador" rev="1.3.0"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<dependency org="org.bouncycastle" name="bcprov-jdk18on" rev="1.75"/>
<dependency org="com.hierynomus" name="asn-one" rev="0.6.0"/>
<dependency org="commons-io" name="commons-io" rev="2.17.0"/>
HiranChaudhuri marked this conversation as resolved.
Show resolved Hide resolved
</dependencies>

</ivy-module>
53 changes: 53 additions & 0 deletions src/plugin/protocol-smb/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<plugin
id="protocol-smb"
name="SMB Protocol based on https://github.com/hierynomus/smbj"
version="1.0.0"
provider-name="Hiran Chaudhuri">

<!-- Jar files that make up the plugin. Only protocol-smb.jar declares an
     export (all names, "*"); the other entries are versioned third-party
     jars listed without exports. -->
<runtime>
<library name="asn-one-0.6.0.jar"/>
lewismc marked this conversation as resolved.
Show resolved Hide resolved
<library name="bcprov-jdk18on-1.75.jar"/>
<library name="mbassador-1.3.0.jar"/>
<library name="protocol-smb.jar">
<export name="*"/>
</library>
<library name="smbj-0.13.0.jar"/>

<library name="commons-io-2.17.0.jar"/>
</runtime>

<!-- This plugin depends on the nutch-extensionpoints plugin. -->
<requires>
<import plugin="nutch-extensionpoints"/>
</requires>

<!-- Registers org.apache.nutch.protocol.smb.SmbProtocol at the
     org.apache.nutch.protocol.Protocol extension point. -->
<extension id="org.apache.nutch.protocol.smb"
name="SmbProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.smb.Smb"
class="org.apache.nutch.protocol.smb.SmbProtocol">
<!-- protocolName is set to "smb"; urlStreamHandler names the handler class
     for that protocol. -->
<parameter name="protocolName" value="smb"/>
<parameter name="urlStreamHandler" value="org.apache.nutch.protocol.smb.SmbHandler"/>
</implementation>

</extension>

</plugin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.smb;

import java.net.URL;
import java.net.URLConnection;
import java.net.URLStreamHandler;

/**
 * {@link URLStreamHandler} whose {@link #openConnection(URL)} wraps the given
 * URL in a new {@code SmbURLConnection}.
 */
public class SmbHandler extends URLStreamHandler {

  /**
   * Creates the connection object for the given URL.
   *
   * @param u the URL to create a connection object for
   * @return a newly constructed {@code SmbURLConnection} for {@code u}
   */
  @Override
  protected URLConnection openConnection(URL u) {
    final URLConnection connection = new SmbURLConnection(u);
    return connection;
  }
}
Loading