aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/test/multibyte_handler_test.rb
blob: 5575ecc32d4dc0f0c55d53162cd9eb200e15490e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
# encoding: utf-8
require 'abstract_unit'

if RUBY_VERSION < '1.9'

$KCODE = 'UTF8'

class String
  # Unicode Inspect returns the codepoints of the string in hex
  def ui
    "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' '))
  end unless ''.respond_to?(:ui)
end

module UTF8HandlingTest
  
  def common_setup
    # This is an ASCII string with some russian strings and a ligature. It's nicely calibrated, because
    # slicing it at some specific bytes will kill your characters if you use standard Ruby routines.
    # It has both capital and standard letters, so that we can test case conversions easily.
    # It has 26 characters and 28 when the ligature gets split during normalization.
    @string =     "Abcd Блå ffi бла бла бла бла"
    @string_kd =  "Abcd Блå ffi бла бла бла бла"
    @string_kc =  "Abcd Блå ffi бла бла бла бла"
    @string_c =   "Abcd Блå ffi бла бла бла бла"
    @string_d =   "Abcd Блå ffi бла бла бла бла"
    @bytestring = "\270\236\010\210\245" # Not UTF-8
    
    # Characters from the character classes as described in UAX #29
    @character_from_class = {
      :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A,
      :extend => 0x094D, :n => 0x64
    }
  end
  
  def test_utf8_recognition
    assert ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@string),
      "Should recognize as a valid UTF-8 string"
    assert !ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@bytestring), "This is bytestring, not UTF-8"
  end
  
  def test_simple_normalization
    # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
    assert_equal [0x915, 0x93c].pack('U*').ui, [0x915, 0x93c].pack('U*').chars.normalize(:c).to_s.ui
    
    null_byte_str = "Test\0test"
    
    assert_equal '', @handler.normalize(''), "Empty string should not break things"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kc).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :c).ui, "Null byte should remain" 
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :d).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kd).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.decompose(null_byte_str).ui, "Null byte should remain"
    assert_equal null_byte_str.ui, @handler.compose(null_byte_str).ui, "Null byte should remain" 
    
    comp_str = [
      44,  # LATIN CAPITAL LETTER D
      307, # COMBINING DOT ABOVE
      328, # COMBINING OGONEK
      323 # COMBINING DOT BELOW
    ].pack("U*")
    norm_str_KC = [44,105,106,328,323].pack("U*")
    norm_str_C = [44,307,328,323].pack("U*")
    norm_str_D = [44,307,110,780,78,769].pack("U*")
    norm_str_KD = [44,105,106,110,780,78,769].pack("U*")
    
    assert_equal norm_str_KC.ui, @handler.normalize(comp_str, :kc).ui, "Should normalize KC"
    assert_equal norm_str_C.ui, @handler.normalize(comp_str, :c).ui, "Should normalize C"
    assert_equal norm_str_D.ui, @handler.normalize(comp_str, :d).ui, "Should normalize D"
    assert_equal norm_str_KD.ui, @handler.normalize(comp_str, :kd).ui, "Should normalize KD"
    
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.normalize(@bytestring) }
  end
  
  # Test for the Public Review Issue #29, bad explanation of composition might lead to a
  # bad implementation: http://www.unicode.org/review/pr-29.html
  def test_normalization_C_pri_29
    [
      [0x0B47, 0x0300, 0x0B3E],
      [0x1100, 0x0300, 0x1161]
    ].map { |c| c.pack('U*') }.each do |c|
      assert_equal c.ui, @handler.normalize(c, :c).ui, "Composition is implemented incorrectly"
    end
  end
  
  def test_casefolding
    simple_str = "abCdef"
    simple_str_upcase = "ABCDEF"
    simple_str_downcase = "abcdef"
    
    assert_equal '', @handler.downcase(@handler.upcase('')), "Empty string should not break things"
    assert_equal simple_str_upcase, @handler.upcase(simple_str), "should upcase properly"
    assert_equal simple_str_downcase, @handler.downcase(simple_str), "should downcase properly"
    assert_equal simple_str_downcase, @handler.downcase(@handler.upcase(simple_str_downcase)), "upcase and downcase should be mirrors"
    
    rus_str = "аБвгд\0f"
    rus_str_upcase = "АБВГД\0F"
    rus_str_downcase = "абвгд\0f"
    assert_equal rus_str_upcase, @handler.upcase(rus_str), "should upcase properly honoring null-byte"
    assert_equal rus_str_downcase, @handler.downcase(rus_str), "should downcase properly honoring null-byte"
    
    jap_str = "の埋め込み化対応はほぼ完成"
    assert_equal jap_str, @handler.upcase(jap_str), "Japanse has no upcase, should remain unchanged"
    assert_equal jap_str, @handler.downcase(jap_str), "Japanse has no downcase, should remain unchanged"
    
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.upcase(@bytestring) }
  end
  
  def test_capitalize
    { 'аБвг аБвг' => 'Абвг абвг',
      'аБвг АБВГ' => 'Абвг абвг',
      'АБВГ АБВГ' => 'Абвг абвг',
      '' => '' }.each do |f,t|
        assert_equal t, @handler.capitalize(f), "Capitalize should work as expected"
    end
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.capitalize(@bytestring) }
  end
  
  def test_translate_offset
    str = "Блaå" # [2, 2, 1, 2] bytes
    assert_equal 0, @handler.translate_offset('', 0), "Offset for an empty string makes no sense, return 0"
    assert_equal 0, @handler.translate_offset(str, 0), "First character, first byte"
    assert_equal 0, @handler.translate_offset(str, 1), "First character, second byte"
    assert_equal 1, @handler.translate_offset(str, 2), "Second character, third byte"
    assert_equal 1, @handler.translate_offset(str, 3), "Second character, fourth byte"
    assert_equal 2, @handler.translate_offset(str, 4), "Third character, fifth byte"
    assert_equal 3, @handler.translate_offset(str, 5), "Fourth character, sixth byte"
    assert_equal 3, @handler.translate_offset(str, 6), "Fourth character, seventh byte"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.translate_offset(@bytestring, 3) }
  end
  
  def test_insert
    assert_equal '', @handler.insert('', 0, ''), "Empty string should not break things"
    assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @handler.insert(@string, 10, "БУМ"), 
      "Text should be inserted at right codepoints"
    assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @string, "Insert should be destructive"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) do
      @handler.insert(@bytestring, 2, "\210")
    end
  end
  
  def test_reverse
    str = "wБлåa \n"
    rev = "\n aåлБw"
    assert_equal '', @handler.reverse(''), "Empty string shouldn't change"
    assert_equal rev.ui, @handler.reverse(str).ui, "Should reverse properly"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.reverse(@bytestring) }
  end
  
  def test_size
    assert_equal 0, @handler.size(''), "Empty string has size 0"
    assert_equal 26, @handler.size(@string), "String length should be 26"
    assert_equal 26, @handler.length(@string), "String length method should be properly aliased"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.size(@bytestring) }
  end
  
  def test_slice
    assert_equal 0x41, @handler.slice(@string, 0), "Singular characters should return codepoints"
    assert_equal 0xE5, @handler.slice(@string, 7), "Singular characters should return codepoints"
    assert_equal nil, @handler.slice('', -1..1), "Broken range should return nil"
    assert_equal '', @handler.slice('', 0..10), "Empty string should not break things"
    assert_equal "d Блå ffi", @handler.slice(@string, 3..9), "Unicode characters have to be returned"
    assert_equal "d Блå ffi", @handler.slice(@string, 3, 7), "Unicode characters have to be returned"
    assert_equal "A", @handler.slice(@string, 0, 1), "Slicing from an offset should return characters"
    assert_equal " Блå ffi ", @handler.slice(@string, 4..10), "Unicode characters have to be returned"
    assert_equal "ffi бла", @handler.slice(@string, /ffi бла/u), "Slicing on Regexps should be supported"
    assert_equal "ffi бла", @handler.slice(@string, /ffi \w\wа/u), "Slicing on Regexps should be supported"
    assert_equal nil, @handler.slice(@string, /unknown/u), "Slicing on Regexps with no match should return nil"
    assert_equal "ffi бла", @handler.slice(@string, /(ffi бла)/u,1), "Slicing on Regexps with a match group should be supported"
    assert_equal nil, @handler.slice(@string, /(ffi)/u,2), "Slicing with a Regexp and asking for an invalid match group should return nil"
    assert_equal "", @handler.slice(@string, 7..6), "Range is empty, should return an empty string"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.slice(@bytestring, 2..3) }
    assert_raise(TypeError, "With 2 args, should raise TypeError for non-Numeric or Regexp first argument") { @handler.slice(@string, 2..3, 1) }
    assert_raise(TypeError, "With 2 args, should raise TypeError for non-Numeric or Regexp second argument") { @handler.slice(@string, 1, 2..3) }
    assert_raise(ArgumentError, "Should raise ArgumentError when there are more than 2 args") { @handler.slice(@string, 1, 1, 1) }
  end
  
  def test_grapheme_cluster_length
    assert_equal 0, @handler.g_length(''), "String should count 0 grapheme clusters"
    assert_equal 2, @handler.g_length([0x0924, 0x094D, 0x0930].pack('U*')), "String should count 2 grapheme clusters"
    assert_equal 1, @handler.g_length(string_from_classes(%w(cr lf))), "Don't cut between CR and LF"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l l))), "Don't cut between L"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l v))), "Don't cut between L and V"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l lv))), "Don't cut between L and LV"
    assert_equal 1, @handler.g_length(string_from_classes(%w(l lvt))), "Don't cut between L and LVT"
    assert_equal 1, @handler.g_length(string_from_classes(%w(lv v))), "Don't cut between LV and V"
    assert_equal 1, @handler.g_length(string_from_classes(%w(lv t))), "Don't cut between LV and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(v v))), "Don't cut between V and V"
    assert_equal 1, @handler.g_length(string_from_classes(%w(v t))), "Don't cut between V and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(lvt t))), "Don't cut between LVT and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(t t))), "Don't cut between T and T"
    assert_equal 1, @handler.g_length(string_from_classes(%w(n extend))), "Don't cut before Extend"
    assert_equal 2, @handler.g_length(string_from_classes(%w(n n))), "Cut between normal characters"
    assert_equal 3, @handler.g_length(string_from_classes(%w(n cr lf n))), "Don't cut between CR and LF"
    assert_equal 2, @handler.g_length(string_from_classes(%w(n l v t))), "Don't cut between L, V and T"
    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.g_length(@bytestring) }
  end
  
  def test_index
     s = "Καλημέρα κόσμε!"
     assert_equal 0, @handler.index('', ''), "The empty string is always found at the beginning of the string"
     assert_equal 0, @handler.index('haystack', ''), "The empty string is always found at the beginning of the string"
     assert_equal 0, @handler.index(s, 'Κ'), "Greek K is at 0"
     assert_equal 1, @handler.index(s, 'α'), "Greek Alpha is at 1"
     
     assert_equal nil, @handler.index(@bytestring, 'a')
     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.index(@bytestring, "\010") }
  end
  
  def test_indexed_insert
    s = "Καλη!"
    @handler[s, 2] = "a"
    assert_equal "Καaη!", s
    @handler[s, 2] = "ηη"
    assert_equal "Καηηη!", s
    assert_raises(IndexError) { @handler[s, 10] = 'a' }
    assert_equal "Καηηη!", s
    @handler[s, 2] = 32
    assert_equal "Κα ηη!", s
    @handler[s, 3, 2] = "λλλ"
    assert_equal "Κα λλλ!", s
    @handler[s, 1, 0] = "λ"
    assert_equal "Κλα λλλ!", s
    assert_raises(IndexError) { @handler[s, 10, 4] = 'a' }
    assert_equal "Κλα λλλ!", s
    @handler[s, 4..6] = "ηη"
    assert_equal "Κλα ηη!", s
    assert_raises(RangeError) { @handler[s, 10..12] = 'a' }
    assert_equal "Κλα ηη!", s
    @handler[s, /ηη/] = "λλλ"
    assert_equal "Κλα λλλ!", s
    assert_raises(IndexError) { @handler[s, /ii/] = 'a' }
    assert_equal "Κλα λλλ!", s
    @handler[s, /(λλ)(.)/, 2] = "α"
    assert_equal "Κλα λλα!", s
    assert_raises(IndexError) { @handler[s, /()/, 10] = 'a' }
    assert_equal "Κλα λλα!", s
    @handler[s, "α"] = "η"
    assert_equal "Κλη λλα!", s
    @handler[s, "λλ"] = "ααα"
    assert_equal "Κλη αααα!", s
  end
  
  def test_rjust
    s = "Καη"
    assert_raises(ArgumentError) { @handler.rjust(s, 10, '') }
    assert_raises(ArgumentError) { @handler.rjust(s) }
    assert_equal "Καη", @handler.rjust(s, -3)
    assert_equal "Καη", @handler.rjust(s, 0)
    assert_equal "Καη", @handler.rjust(s, 3)
    assert_equal "  Καη", @handler.rjust(s, 5)
    assert_equal "    Καη", @handler.rjust(s, 7)
    assert_equal "----Καη", @handler.rjust(s, 7, '-')
    assert_equal "ααααΚαη", @handler.rjust(s, 7, 'α')
    assert_equal "abaΚαη", @handler.rjust(s, 6, 'ab')
    assert_equal "αηαΚαη", @handler.rjust(s, 6, 'αη')
  end
  
  def test_ljust
    s = "Καη"
    assert_raises(ArgumentError) { @handler.ljust(s, 10, '') }
    assert_raises(ArgumentError) { @handler.ljust(s) }
    assert_equal "Καη", @handler.ljust(s, -3)
    assert_equal "Καη", @handler.ljust(s, 0)
    assert_equal "Καη", @handler.ljust(s, 3)
    assert_equal "Καη  ", @handler.ljust(s, 5)
    assert_equal "Καη    ", @handler.ljust(s, 7)
    assert_equal "Καη----", @handler.ljust(s, 7, '-')
    assert_equal "Καηαααα", @handler.ljust(s, 7, 'α')
    assert_equal "Καηaba", @handler.ljust(s, 6, 'ab')
    assert_equal "Καηαηα", @handler.ljust(s, 6, 'αη')
  end
  
  def test_center
    s = "Καη"
    assert_raises(ArgumentError) { @handler.center(s, 10, '') }
    assert_raises(ArgumentError) { @handler.center(s) }
    assert_equal "Καη", @handler.center(s, -3)
    assert_equal "Καη", @handler.center(s, 0)
    assert_equal "Καη", @handler.center(s, 3)
    assert_equal "Καη ", @handler.center(s, 4)
    assert_equal " Καη ", @handler.center(s, 5)
    assert_equal " Καη  ", @handler.center(s, 6)
    assert_equal "--Καη--", @handler.center(s, 7, '-')
    assert_equal "--Καη---", @handler.center(s, 8, '-')
    assert_equal "ααΚαηαα", @handler.center(s, 7, 'α')
    assert_equal "ααΚαηααα", @handler.center(s, 8, 'α')
    assert_equal "aΚαηab", @handler.center(s, 6, 'ab')
    assert_equal "abΚαηab", @handler.center(s, 7, 'ab')
    assert_equal "ababΚαηabab", @handler.center(s, 11, 'ab')
    assert_equal "αΚαηαη", @handler.center(s, 6, 'αη')
    assert_equal "αηΚαηαη", @handler.center(s, 7, 'αη')
  end
  
  def test_strip
    # A unicode aware version of strip should strip all 26 types of whitespace. This includes the NO BREAK SPACE
    # aka BOM (byte order mark). The byte order mark has no place in UTF-8 because it's used to detect LE and BE.
    b = "\n" + [
      32, # SPACE
      8195, # EM SPACE
      8199, # FIGURE SPACE,
      8201, # THIN SPACE
      8202, # HAIR SPACE
      65279, # NO BREAK SPACE (ZW)
    ].pack('U*')
    m = "word блин\n\n\n  word"
    e = [
    65279, # NO BREAK SPACE (ZW)
    8201, # THIN SPACE
    8199, # FIGURE SPACE,      
    32, # SPACE
    ].pack('U*')
    string = b+m+e
    
    assert_equal '', @handler.strip(''), "Empty string should stay empty"
    assert_equal m+e, @handler.lstrip(string), "Whitespace should be gone on the left"
    assert_equal b+m, @handler.rstrip(string), "Whitespace should be gone on the right"
    assert_equal m, @handler.strip(string), "Whitespace should be stripped on both sides"
    
    bs = "\n   #{@bytestring} \n\n"
    assert_equal @bytestring, @handler.strip(bs), "Invalid unicode strings should still strip"
  end
  
  def test_tidy_bytes
    result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
    assert_equal result, @handler.tidy_bytes(@bytestring)
    assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a'),
      'tidy_bytes should leave surrounding characters intact'
    assert_equal #{result}é", @handler.tidy_bytes('é' + @bytestring + 'é'),
      'tidy_bytes should leave surrounding characters intact'
    assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
    
    assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla
    assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol
    assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote
    assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro
    assert_equal "\x00", @handler.tidy_bytes("\x00") # null char
    assert_equal [0xfffd].pack('U'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char
  end
  
  protected
  
  def string_from_classes(classes)
    classes.collect do |k|
      @character_from_class[k.intern]
    end.pack('U*')
  end
end


begin
  require_library_or_gem('utf8proc_native')
  require 'active_record/multibyte/handlers/utf8_handler_proc'
  class UTF8HandlingTestProc < Test::Unit::TestCase
    include UTF8HandlingTest
    def setup
      common_setup
      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
    end
  end
rescue LoadError
end

class UTF8HandlingTestPure < Test::Unit::TestCase
  include UTF8HandlingTest
  def setup
    common_setup
    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler
  end
end

end