1
+ /*
2
+ * Licensed to Elasticsearch B.V. under one or more contributor
3
+ * license agreements. See the NOTICE file distributed with
4
+ * this work for additional information regarding copyright
5
+ * ownership. Elasticsearch B.V. licenses this file to you under
6
+ * the Apache License, Version 2.0 (the "License"); you may
7
+ * not use this file except in compliance with the License.
8
+ * You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ package org .logstash .common ;
21
+
22
+ import org .jruby .RubyArray ;
23
+ import org .jruby .RubyEncoding ;
24
+ import org .jruby .RubyString ;
25
+ import org .jruby .runtime .ThreadContext ;
26
+ import org .jruby .runtime .builtin .IRubyObject ;
27
+ import org .junit .Before ;
28
+ import org .junit .Test ;
29
+ import org .logstash .RubyTestBase ;
30
+ import org .logstash .RubyUtil ;
31
+
32
+ import java .util .List ;
33
+
34
+ import static org .junit .Assert .assertEquals ;
35
+ import static org .junit .Assert .assertTrue ;
36
+ import static org .logstash .RubyUtil .RUBY ;
37
+
38
+ @ SuppressWarnings ("unchecked" )
39
+ public final class BufferedTokenizerExtTest extends RubyTestBase {
40
+
41
+ private BufferedTokenizerExt sut ;
42
+ private ThreadContext context ;
43
+
44
+ @ Before
45
+ public void setUp () {
46
+ sut = new BufferedTokenizerExt (RubyUtil .RUBY , RubyUtil .BUFFERED_TOKENIZER );
47
+ context = RUBY .getCurrentContext ();
48
+ IRubyObject [] args = {};
49
+ sut .init (context , args );
50
+ }
51
+
52
+ @ Test
53
+ public void shouldTokenizeASingleToken () {
54
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo\n " ));
55
+
56
+ assertEquals (List .of ("foo" ), tokens );
57
+ }
58
+
59
+ @ Test
60
+ public void shouldMergeMultipleToken () {
61
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo" ));
62
+ assertTrue (tokens .isEmpty ());
63
+
64
+ tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("bar\n " ));
65
+ assertEquals (List .of ("foobar" ), tokens );
66
+ }
67
+
68
+ @ Test
69
+ public void shouldTokenizeMultipleToken () {
70
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo\n bar\n " ));
71
+
72
+ assertEquals (List .of ("foo" , "bar" ), tokens );
73
+ }
74
+
75
+ @ Test
76
+ public void shouldIgnoreEmptyPayload () {
77
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("" ));
78
+ assertTrue (tokens .isEmpty ());
79
+
80
+ tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo\n bar" ));
81
+ assertEquals (List .of ("foo" ), tokens );
82
+ }
83
+
84
+ @ Test
85
+ public void shouldTokenizeEmptyPayloadWithNewline () {
86
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("\n " ));
87
+ assertEquals (List .of ("" ), tokens );
88
+
89
+ tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("\n \n \n " ));
90
+ assertEquals (List .of ("" , "" , "" ), tokens );
91
+ }
92
+
93
+ @ Test
94
+ public void shouldNotChangeEncodingOfTokensAfterPartitioning () {
95
+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 , 0x0A , 0x41 }); // £ character, newline, A
96
+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
97
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >)sut .extract (context , rubyInput );
98
+
99
+ // read the first token, the £ string
100
+ IRubyObject firstToken = tokens .shift (context );
101
+ assertEquals ("£" , firstToken .toString ());
102
+
103
+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
104
+ RubyEncoding encoding = (RubyEncoding ) firstToken .callMethod (context , "encoding" );
105
+ assertEquals ("ISO-8859-1" , encoding .toString ());
106
+ }
107
+
108
+ @ Test
109
+ public void shouldNotChangeEncodingOfTokensAfterPartitioningInCaseMultipleExtractionInInvoked () {
110
+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 }); // £ character
111
+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
112
+ sut .extract (context , rubyInput );
113
+ IRubyObject capitalAInLatin1 = RubyString .newString (RUBY , new byte []{(byte ) 0x41 })
114
+ .force_encoding (context , RUBY .newString ("ISO8859-1" ));
115
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >)sut .extract (context , capitalAInLatin1 );
116
+ assertTrue (tokens .isEmpty ());
117
+
118
+ tokens = (RubyArray <RubyString >)sut .extract (context , RubyString .newString (RUBY , new byte []{(byte ) 0x0A }));
119
+
120
+ // read the first token, the £ string
121
+ IRubyObject firstToken = tokens .shift (context );
122
+ assertEquals ("£A" , firstToken .toString ());
123
+
124
+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
125
+ RubyEncoding encoding = (RubyEncoding ) firstToken .callMethod (context , "encoding" );
126
+ assertEquals ("ISO-8859-1" , encoding .toString ());
127
+ }
128
+
129
+ @ Test
130
+ public void shouldNotChangeEncodingOfTokensAfterPartitioningWhenRetrieveLastFlushedToken () {
131
+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 , 0x0A , 0x41 }); // £ character, newline, A
132
+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
133
+ RubyArray <RubyString > tokens = (RubyArray <RubyString >)sut .extract (context , rubyInput );
134
+
135
+ // read the first token, the £ string
136
+ IRubyObject firstToken = tokens .shift (context );
137
+ assertEquals ("£" , firstToken .toString ());
138
+
139
+ // flush and check that the remaining A is still encoded in ISO8859-1
140
+ IRubyObject lastToken = sut .flush (context );
141
+ assertEquals ("A" , lastToken .toString ());
142
+
143
+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
144
+ RubyEncoding encoding = (RubyEncoding ) lastToken .callMethod (context , "encoding" );
145
+ assertEquals ("ISO-8859-1" , encoding .toString ());
146
+ }
147
+
148
+ @ Test
149
+ public void givenDirectFlushInvocationUTF8EncodingIsApplied () {
150
+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 , 0x41 }); // £ character, A
151
+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
152
+
153
+ // flush and check that the remaining A is still encoded in ISO8859-1
154
+ IRubyObject lastToken = sut .flush (context );
155
+ assertEquals ("" , lastToken .toString ());
156
+
157
+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
158
+ RubyEncoding encoding = (RubyEncoding ) lastToken .callMethod (context , "encoding" );
159
+ assertEquals ("UTF-8" , encoding .toString ());
160
+ }
161
+ }
0 commit comments