Skip to content

Commit d2d231e

Browse files
committed
Investigated the huge number of invalid queries; it turns out that query.wikidata.org automatically adds some prefixes to its queries, which we did not do. Implemented this and fixed some minor issues when parsing URLs.
1 parent 4d192da commit d2d231e

10 files changed

+263
-172
lines changed

geosoft_checks.xml

+121-115
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<?xml version="1.0" encoding="UTF-8"?>
2-
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN" "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
2+
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
3+
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
34

45
<!--
56
This configuration file was written by the eclipse-cs plugin configuration editor
@@ -9,118 +10,123 @@
910
Description: none
1011
-->
1112
<module name="Checker">
12-
<property name="severity" value="warning"/>
13-
<module name="TreeWalker">
14-
<module name="JavadocType">
15-
<property name="authorFormat" value="\S"/>
16-
</module>
17-
<module name="JavadocMethod">
18-
<property name="allowUndeclaredRTE" value="true"/>
19-
</module>
20-
<module name="JavadocVariable"/>
21-
<module name="JavadocStyle"/>
22-
<module name="PackageName">
23-
<property name="format" value="^[a-z]+(\.[a-z][a-z0-9]*)*$"/>
24-
</module>
25-
<module name="TypeName"/>
26-
<module name="MemberName"/>
27-
<module name="LocalFinalVariableName"/>
28-
<module name="LocalVariableName"/>
29-
<module name="ParameterName"/>
30-
<module name="StaticVariableName"/>
31-
<module name="AvoidStarImport"/>
32-
<module name="IllegalImport"/>
33-
<module name="RedundantImport"/>
34-
<module name="UnusedImports"/>
35-
<module name="ImportOrder">
36-
<property name="groups" value="java,javax"/>
37-
<property name="separated" value="true"/>
38-
</module>
39-
<module name="LineLength">
40-
<property name="severity" value="ignore"/>
41-
<metadata name="net.sf.eclipsecs.core.lastEnabledSeverity" value="inherit"/>
42-
</module>
43-
<module name="MethodParamPad"/>
44-
<module name="NoWhitespaceAfter"/>
45-
<module name="NoWhitespaceBefore"/>
46-
<module name="OperatorWrap">
47-
<property name="option" value="eol"/>
48-
<property name="tokens" value="ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN, BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE, LITERAL_INSTANCEOF, LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL, PLUS, PLUS_ASSIGN, QUESTION, SL, SL_ASSIGN, SR, SR_ASSIGN, STAR, STAR_ASSIGN"/>
49-
</module>
50-
<module name="ParenPad"/>
51-
<module name="TypecastParenPad"/>
52-
<module name="WhitespaceAfter">
53-
<property name="tokens" value="COMMA, SEMI, TYPECAST"/>
54-
</module>
55-
<module name="WhitespaceAround">
56-
<property name="tokens" value="ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN, BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LCURLY, LE, LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE, LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL, PLUS, PLUS_ASSIGN, QUESTION, RCURLY, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN, STAR, STAR_ASSIGN, TYPE_EXTENSION_AND, WILDCARD_TYPE"/>
57-
</module>
58-
<module name="ModifierOrder"/>
59-
<module name="RedundantModifier"/>
60-
<module name="EmptyBlock"/>
61-
<module name="LeftCurly">
62-
<property name="option" value="nl"/>
63-
<property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
64-
</module>
65-
<module name="LeftCurly">
66-
<property name="tokens" value="LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
67-
</module>
68-
<module name="RightCurly">
69-
<property name="option" value="alone"/>
70-
<property name="tokens" value="LITERAL_CATCH, LITERAL_FINALLY, LITERAL_ELSE"/>
71-
</module>
72-
<module name="AvoidNestedBlocks">
73-
<property name="allowInSwitchCase" value="true"/>
74-
</module>
75-
<module name="CovariantEquals"/>
76-
<module name="EmptyStatement"/>
77-
<module name="EqualsHashCode"/>
78-
<module name="HiddenField"/>
79-
<module name="InnerAssignment"/>
80-
<module name="MissingSwitchDefault"/>
81-
<module name="ModifiedControlVariable"/>
82-
<module name="SimplifyBooleanExpression"/>
83-
<module name="SimplifyBooleanReturn"/>
84-
<module name="StringLiteralEquality"/>
85-
<module name="SuperClone"/>
86-
<module name="SuperFinalize"/>
87-
<module name="IllegalCatch"/>
88-
<module name="IllegalThrows"/>
89-
<module name="PackageDeclaration"/>
90-
<module name="DeclarationOrder"/>
91-
<module name="ParameterAssignment"/>
92-
<module name="ExplicitInitialization"/>
93-
<module name="DefaultComesLast"/>
94-
<module name="FallThrough"/>
95-
<module name="MultipleVariableDeclarations"/>
96-
<module name="UnnecessaryParentheses"/>
97-
<module name="VisibilityModifier"/>
98-
<module name="FinalClass"/>
99-
<module name="HideUtilityClassConstructor"/>
100-
<module name="DesignForExtension"/>
101-
<module name="MutableException"/>
102-
<module name="ClassFanOutComplexity"/>
103-
<module name="CyclomaticComplexity">
104-
<property name="max" value="20"/>
105-
</module>
106-
<module name="NPathComplexity"/>
107-
<module name="RegexpSinglelineJava">
108-
<property name="format" value="\s+$"/>
109-
<property name="message" value="Line has trailing spaces."/>
110-
</module>
111-
<module name="TodoComment">
112-
<property name="format" value="TODO"/>
113-
</module>
114-
<module name="UncommentedMain"/>
115-
<module name="UpperEll"/>
116-
<module name="ArrayTypeStyle"/>
117-
<module name="Indentation">
118-
<property name="basicOffset" value="2"/>
119-
<property name="caseIndent" value="2"/>
120-
</module>
121-
<module name="TrailingComment"/>
122-
</module>
123-
<module name="JavadocPackage"/>
124-
<module name="NewlineAtEndOfFile"/>
125-
<module name="Translation"/>
13+
<property name="severity" value="warning"/>
14+
<module name="TreeWalker">
15+
<module name="JavadocType">
16+
<property name="authorFormat" value="\S"/>
17+
</module>
18+
<module name="JavadocMethod">
19+
<property name="allowUndeclaredRTE" value="true"/>
20+
</module>
21+
<module name="JavadocVariable"/>
22+
<module name="JavadocStyle"/>
23+
<module name="PackageName">
24+
<property name="format" value="^[a-z]+(\.[a-z][a-z0-9]*)*$"/>
25+
</module>
26+
<module name="TypeName"/>
27+
<module name="MemberName"/>
28+
<module name="LocalFinalVariableName"/>
29+
<module name="LocalVariableName"/>
30+
<module name="ParameterName"/>
31+
<module name="StaticVariableName"/>
32+
<module name="AvoidStarImport"/>
33+
<module name="IllegalImport"/>
34+
<module name="RedundantImport"/>
35+
<module name="UnusedImports"/>
36+
<module name="ImportOrder">
37+
<property name="groups" value="java,javax"/>
38+
<property name="separated" value="true"/>
39+
</module>
40+
<module name="LineLength">
41+
<property name="severity" value="ignore"/>
42+
<metadata name="net.sf.eclipsecs.core.lastEnabledSeverity" value="inherit"/>
43+
</module>
44+
<module name="MethodParamPad"/>
45+
<module name="NoWhitespaceAfter"/>
46+
<module name="NoWhitespaceBefore"/>
47+
<module name="OperatorWrap">
48+
<property name="option" value="eol"/>
49+
<property name="tokens"
50+
value="ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN, BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE, LITERAL_INSTANCEOF, LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL, PLUS, PLUS_ASSIGN, QUESTION, SL, SL_ASSIGN, SR, SR_ASSIGN, STAR, STAR_ASSIGN"/>
51+
</module>
52+
<module name="ParenPad"/>
53+
<module name="TypecastParenPad"/>
54+
<module name="WhitespaceAfter">
55+
<property name="tokens" value="COMMA, SEMI, TYPECAST"/>
56+
</module>
57+
<module name="WhitespaceAround">
58+
<property name="tokens"
59+
value="ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN, BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LCURLY, LE, LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE, LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL, PLUS, PLUS_ASSIGN, QUESTION, RCURLY, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN, STAR, STAR_ASSIGN, TYPE_EXTENSION_AND, WILDCARD_TYPE"/>
60+
</module>
61+
<module name="ModifierOrder"/>
62+
<module name="RedundantModifier"/>
63+
<module name="EmptyBlock"/>
64+
<module name="LeftCurly">
65+
<property name="option" value="nl"/>
66+
<property name="tokens"
67+
value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
68+
</module>
69+
<module name="LeftCurly">
70+
<property name="tokens"
71+
value="LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
72+
</module>
73+
<module name="RightCurly">
74+
<property name="option" value="alone"/>
75+
<property name="tokens"
76+
value="LITERAL_CATCH, LITERAL_FINALLY, LITERAL_ELSE"/>
77+
</module>
78+
<module name="AvoidNestedBlocks">
79+
<property name="allowInSwitchCase" value="true"/>
80+
</module>
81+
<module name="CovariantEquals"/>
82+
<module name="EmptyStatement"/>
83+
<module name="EqualsHashCode"/>
84+
<module name="HiddenField"/>
85+
<module name="InnerAssignment"/>
86+
<module name="MissingSwitchDefault"/>
87+
<module name="ModifiedControlVariable"/>
88+
<module name="SimplifyBooleanExpression"/>
89+
<module name="SimplifyBooleanReturn"/>
90+
<module name="StringLiteralEquality"/>
91+
<module name="SuperClone"/>
92+
<module name="SuperFinalize"/>
93+
<module name="IllegalCatch"/>
94+
<module name="IllegalThrows"/>
95+
<module name="PackageDeclaration"/>
96+
<module name="DeclarationOrder"/>
97+
<module name="ParameterAssignment"/>
98+
<module name="ExplicitInitialization"/>
99+
<module name="DefaultComesLast"/>
100+
<module name="FallThrough"/>
101+
<module name="MultipleVariableDeclarations"/>
102+
<module name="UnnecessaryParentheses"/>
103+
<module name="VisibilityModifier"/>
104+
<module name="FinalClass"/>
105+
<module name="HideUtilityClassConstructor"/>
106+
<module name="DesignForExtension"/>
107+
<module name="MutableException"/>
108+
<module name="ClassFanOutComplexity"/>
109+
<module name="CyclomaticComplexity">
110+
<property name="max" value="20"/>
111+
</module>
112+
<module name="NPathComplexity"/>
113+
<module name="RegexpSinglelineJava">
114+
<property name="format" value="\s+$"/>
115+
<property name="message" value="Line has trailing spaces."/>
116+
</module>
117+
<module name="TodoComment">
118+
<property name="format" value="TODO"/>
119+
</module>
120+
<module name="UncommentedMain"/>
121+
<module name="UpperEll"/>
122+
<module name="ArrayTypeStyle"/>
123+
<module name="Indentation">
124+
<property name="basicOffset" value="2"/>
125+
<property name="caseIndent" value="2"/>
126+
</module>
127+
<module name="TrailingComment"/>
128+
</module>
129+
<module name="JavadocPackage"/>
130+
<module name="NewlineAtEndOfFile"/>
131+
<module name="Translation"/>
126132
</module>

pom.xml

+13-6
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,21 @@
106106
</dependency>
107107

108108
<dependency>
109-
<groupId>org.apache.spark</groupId>
110-
<artifactId>spark-core_2.11</artifactId>
111-
<version>2.0.1</version>
109+
<groupId>org.apache.spark</groupId>
110+
<artifactId>spark-core_2.11</artifactId>
111+
<version>2.0.1</version>
112112
</dependency>
113+
114+
<dependency>
115+
<groupId>org.apache.hadoop</groupId>
116+
<artifactId>hadoop-client</artifactId>
117+
<version>2.2.0</version>
118+
</dependency>
119+
113120
<dependency>
114-
<groupId>org.apache.hadoop</groupId>
115-
<artifactId>hadoop-client</artifactId>
116-
<version>2.2.0</version>
121+
<groupId>log4j</groupId>
122+
<artifactId>log4j</artifactId>
123+
<version>1.2.17</version>
117124
</dependency>
118125
</dependencies>
119126
</project>

src/main/java/general/Main.java

+5-6
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,16 @@
1919
* #L%
2020
*/
2121

22-
import java.io.FileNotFoundException;
23-
2422
import input.InputHandler;
2523
import logging.LoggingHandler;
26-
2724
import org.apache.log4j.Logger;
28-
2925
import output.OutputHandler;
3026
import query.JenaQueryHandler;
3127
import query.OpenRDFQueryHandler;
3228
import query.QueryHandler;
3329

30+
import java.io.FileNotFoundException;
31+
3432

3533
/**
3634
* @author jgonsior
@@ -65,9 +63,10 @@ public static void main(String[] args)
6563
if (argument.equals("-jena")) queryHandler = new JenaQueryHandler();
6664
if (argument.equals("-openrdf")) queryHandler = new OpenRDFQueryHandler();
6765
}
66+
6867
LoggingHandler.initConsoleLog();
6968

70-
for (int i = 1; i <= 30; i++) {
69+
for (int i = 1; i <= 1; i++) {
7170
String inputFile = "QueryCnt" + String.format("%02d", i) + ".tsv";
7271
String outputFile = "QueryProcessedSept" +
7372
String.format("%02d", i) + ".tsv";
@@ -89,4 +88,4 @@ public static void main(String[] args)
8988
}
9089
}
9190
}
92-
}
91+
}

src/main/java/input/InputHandler.java

+23-20
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,17 @@
11
package input;
22

3-
import java.io.FileInputStream;
4-
import java.io.FileNotFoundException;
5-
import java.io.InputStreamReader;
6-
import java.io.Reader;
7-
import java.net.URI;
8-
import java.net.URISyntaxException;
9-
import java.util.List;
10-
113
import com.univocity.parsers.common.ParsingContext;
124
import com.univocity.parsers.common.processor.ObjectRowProcessor;
135
import com.univocity.parsers.tsv.TsvParser;
146
import com.univocity.parsers.tsv.TsvParserSettings;
15-
import org.apache.http.NameValuePair;
16-
import org.apache.http.client.utils.URLEncodedUtils;
177
import org.apache.log4j.Logger;
188
import output.OutputHandler;
199

10+
import java.io.*;
11+
import java.net.MalformedURLException;
12+
import java.net.URL;
13+
import java.net.URLDecoder;
14+
2015
/**
2116
* @author adrian
2217
*/
@@ -65,20 +60,28 @@ public final void parseTo(final OutputHandler outputHandler)
6560
@Override
6661
public void rowProcessed(Object[] row, ParsingContext parsingContext)
6762
{
68-
String queryString = null;
63+
String queryString = "";
6964
try {
70-
//parse url
71-
List<NameValuePair> params = URLEncodedUtils.parse(
72-
new URI((String) row[0]), "UTF-8");
65+
// the url needs to be transformed first into a URL and then later into a URI because the character ^
66+
// which is included in some Queries is apparently an illegal character which needs to be encoded
67+
// differently (which the creation of a URL object first is dealing with)
68+
URL url = new URL("https://query.wikidata.org/" + row[0]);
7369

74-
//find out the query parameter
75-
for (NameValuePair param : params) {
76-
if (param.getName().equals("query")) {
77-
queryString = param.getValue();
70+
//parse url
71+
String[] pairs = url.getQuery().split("&");
72+
for (String pair : pairs) {
73+
int idx = pair.indexOf("=");
74+
String key = idx > 0 ? URLDecoder.decode(pair.substring(0, idx), "UTF-8") : pair;
75+
if (key.equals("query")) {
76+
//find out the query parameter
77+
queryString = idx > 0 && pair.length() > idx + 1 ? URLDecoder.decode(pair.substring(idx + 1), "UTF-8") : null;
7878
}
7979
}
80-
} catch (URISyntaxException e) {
81-
logger.warn("There was a syntax error in the following URI: " + row[0] + " /nFound at " + inputFile + ", line " + parsingContext.currentLine());
80+
81+
} catch (MalformedURLException e) {
82+
logger.error("There was a syntax error in the following URL: " + row[0] + " /nFound at " + inputFile + ", line " + parsingContext.currentLine() + "\n" + e.getMessage());
83+
} catch (UnsupportedEncodingException e) {
84+
logger.error("Your system apperently doesn't supports UTF-8 encoding. Please fix this before running this software again.");
8285
}
8386
outputHandler.writeLine(queryString, row, parsingContext.currentLine(), inputFile);
8487
}

src/main/java/input/SparkHadoopExperiment.java

-2
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
package input;
22

33
import logging.LoggingHandler;
4-
54
import org.apache.log4j.Level;
65
import org.apache.spark.SparkConf;
76
import org.apache.spark.api.java.JavaSparkContext;
87

98
/**
109
* @author adrian
11-
*
1210
*/
1311
public final class SparkHadoopExperiment
1412
{

0 commit comments

Comments
 (0)