# encoding: utf-8
require 'abstract_unit'
require 'multibyte_test_helpers'
require 'active_support/core_ext/string/multibyte'
# Test-only additions to String. The multibyte tests below invoke these through
# a Chars proxy to observe how the results of forwarded calls are handled.
class String
  # Returns a non-String value (the Integer 1).
  def __method_for_multibyte_testing_with_integer_result
    1
  end

  # Returns the String 'result'.
  def __method_for_multibyte_testing
    'result'
  end

  # Bang-named method that returns the String 'result'.
  def __method_for_multibyte_testing!
    'result'
  end

  # Bang-named method that returns nil.
  def __method_for_multibyte_testing_that_returns_nil!
    nil
  end
end
# Basic proxy behaviour of ActiveSupport::Multibyte::Chars: wrapping a string,
# forwarding method calls to it, and concatenation.
class MultibyteCharsTest < Test::Unit::TestCase
  include MultibyteTestHelpers

  # Builds a fresh proxy around UNICODE_STRING for every test.
  def setup
    @proxy_class = ActiveSupport::Multibyte::Chars
    @chars = @proxy_class.new(UNICODE_STRING)
  end

  def test_wraps_the_original_string
    assert_equal(UNICODE_STRING, @chars.to_s)
    assert_equal(UNICODE_STRING, @chars.wrapped_string)
  end

  # Methods that exist on String are reachable through the proxy; unknown
  # methods still raise NoMethodError.
  def test_should_allow_method_calls_to_string
    assert_nothing_raised { @chars.__method_for_multibyte_testing }
    assert_raise(NoMethodError) { @chars.__unknown_method }
  end

  # A forwarded non-bang call that yields a String comes back wrapped in a
  # different proxy object.
  def test_forwarded_method_calls_should_return_new_chars_instance
    assert_kind_of(@proxy_class, @chars.__method_for_multibyte_testing)
    assert_not_equal(@chars.object_id, @chars.__method_for_multibyte_testing.object_id)
  end

  # A forwarded bang call with a non-nil result hands back the receiver itself.
  def test_forwarded_bang_method_calls_should_return_the_original_chars_instance_when_result_is_not_nil
    assert_kind_of(@proxy_class, @chars.__method_for_multibyte_testing!)
    assert_equal(@chars.object_id, @chars.__method_for_multibyte_testing!.object_id)
  end

  def test_forwarded_bang_method_calls_should_return_nil_when_result_is_nil
    assert_nil(@chars.__method_for_multibyte_testing_that_returns_nil!)
  end

  # For BYTE_STRING, mb_chars is expected to hand back an object of the same
  # class as the string itself.
  def test_methods_are_forwarded_to_wrapped_string_for_byte_strings
    assert_equal(BYTE_STRING.class, BYTE_STRING.mb_chars.class)
  end

  def test_forwarded_method_with_non_string_result_should_be_returned_vertabim
    assert_equal(''.__method_for_multibyte_testing_with_integer_result,
                 @chars.__method_for_multibyte_testing_with_integer_result)
  end

  # NOTE: `<<` mutates its receiver, so the order of these assertions matters;
  # the final expectation ('abb') relies on the earlier `mb_a << 'b'`.
  def test_should_concatenate
    mb_a = 'a'.mb_chars
    mb_b = 'b'.mb_chars

    assert_equal('ab', mb_a + 'b')
    assert_equal('ab', 'a' + mb_b)
    assert_equal('ab', mb_a + mb_b)

    assert_equal('ab', mb_a << 'b')
    assert_equal('ab', 'a' << mb_b)
    assert_equal('abb', mb_a << mb_b)
  end

  def test_consumes_utf8_strings
    assert(@proxy_class.consumes?(UNICODE_STRING))
    assert(@proxy_class.consumes?(ASCII_STRING))
    assert(!@proxy_class.consumes?(BYTE_STRING))
  end

  def test_unpack_utf8_strings
    unicode = ActiveSupport::Multibyte::Unicode
    assert_equal(4, unicode.u_unpack(UNICODE_STRING).length)
    assert_equal(5, unicode.u_unpack(ASCII_STRING).length)
  end

  def test_unpack_raises_encoding_error_on_broken_strings
    assert_raise(ActiveSupport::Multibyte::EncodingError) do
      ActiveSupport::Multibyte::Unicode.u_unpack(BYTE_STRING)
    end
  end

  def test_concatenation_should_return_a_proxy_class_instance
    assert_equal(ActiveSupport::Multibyte.proxy_class, ('a'.mb_chars + 'b').class)
    assert_equal(ActiveSupport::Multibyte.proxy_class, ('a'.mb_chars << 'b').class)
  end

  def test_ascii_strings_are_treated_at_utf8_strings
    assert_equal(ActiveSupport::Multibyte.proxy_class, ASCII_STRING.mb_chars.class)
  end

  def test_concatenate_should_return_proxy_instance
    assert(('a'.mb_chars + 'b').kind_of?(@proxy_class))
    assert(('a'.mb_chars + 'b'.mb_chars).kind_of?(@proxy_class))
    assert(('a'.mb_chars << 'b').kind_of?(@proxy_class))
    assert(('a'.mb_chars << 'b'.mb_chars).kind_of?(@proxy_class))
  end
end
# Checks that a Chars proxy behaves like a String whose offsets, lengths and
# padding are counted in characters rather than bytes.
class MultibyteCharsUTF8BehaviourTest < Test::Unit::TestCase
  include MultibyteTestHelpers

  # Wraps a private copy of UNICODE_STRING so destructive tests cannot leak
  # changes into the shared constant.
  def setup
    @chars = UNICODE_STRING.dup.mb_chars
    # Ruby 1.9 only supports basic whitespace
    @whitespace = "\n\t "
    @byte_order_mark = [65279].pack('U')
  end

  def test_split_should_return_an_array_of_chars_instances
    @chars.split(//).each do |character|
      assert_kind_of ActiveSupport::Multibyte.proxy_class, character
    end
  end

  # Generates two tests per destructive method. Both operate on a Chars proxy
  # (not a bare String) so that the proxy's bang handling is what gets tested.
  # __FILE__/__LINE__ are passed so failures point at this file.
  %w{capitalize downcase lstrip reverse rstrip upcase}.each do |method|
    class_eval(<<-EOTESTS, __FILE__, __LINE__ + 1)
      def test_#{method}_bang_should_return_self_when_modifying_wrapped_string
        proxy = chars(' él piDió Un bUen café ')
        assert_equal proxy.object_id, proxy.send("#{method}!").object_id
      end

      def test_#{method}_bang_should_change_wrapped_string
        original = ' él piDió Un bUen café '
        proxy = chars(original.dup)
        proxy.send("#{method}!")
        assert_not_equal original, proxy.to_s
      end
    EOTESTS
  end

  def test_tidy_bytes_bang_should_return_self
    assert_equal @chars.object_id, @chars.tidy_bytes!.object_id
  end

  def test_tidy_bytes_bang_should_change_wrapped_string
    original = " Un bUen café \x92"
    proxy = chars(original.dup)
    proxy.tidy_bytes!
    assert_not_equal original, proxy.to_s
  end

  def test_unicode_string_should_have_utf8_encoding
    assert_equal Encoding::UTF_8, UNICODE_STRING.encoding
  end

  # A proxy is == and eql? to itself, but not eql? to the bare String it wraps.
  def test_identity
    assert_equal @chars, @chars
    assert @chars.eql?(@chars)
    assert !@chars.eql?(UNICODE_STRING)
  end

  # Every String-returning method should hand back a proxy so calls can be chained.
  def test_string_methods_are_chainable
    assert chars('').insert(0, '').kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').rjust(1).kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').ljust(1).kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').center(1).kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').rstrip.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').lstrip.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').strip.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').reverse.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars(' ').slice(0).kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').limit(0).kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').upcase.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').downcase.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').capitalize.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').normalize.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').decompose.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').compose.kind_of?(ActiveSupport::Multibyte.proxy_class)
    assert chars('').tidy_bytes.kind_of?(ActiveSupport::Multibyte.proxy_class)
  end

  def test_should_be_equal_to_the_wrapped_string
    assert_equal UNICODE_STRING, @chars
    assert_equal @chars, UNICODE_STRING
  end

  def test_should_not_be_equal_to_an_other_string
    assert_not_equal @chars, 'other'
    assert_not_equal 'other', @chars
  end

  def test_sortability
    words = %w(builder armor zebra).sort_by { |s| s.mb_chars }
    assert_equal %w(armor builder zebra), words
  end

  def test_should_return_character_offset_for_regexp_matches
    assert_nil(@chars =~ /wrong/u)
    assert_equal 0, (@chars =~ /こ/u)
    assert_equal 0, (@chars =~ /こに/u)
    assert_equal 1, (@chars =~ /に/u)
    assert_equal 2, (@chars =~ /ち/u)
    assert_equal 3, (@chars =~ /わ/u)
  end

  # insert is destructive, so each expectation builds on the previous insert.
  def test_should_use_character_offsets_for_insert_offsets
    assert_equal '', ''.mb_chars.insert(0, '')
    assert_equal 'こわにちわ', @chars.insert(1, 'わ')
    assert_equal 'こわわわにちわ', @chars.insert(2, 'わわ')
    assert_equal 'わこわわわにちわ', @chars.insert(0, 'わ')
    assert_equal 'わこわわわにちわ', @chars.wrapped_string
  end

  def test_insert_should_be_destructive
    @chars.insert(1, 'わ')
    assert_equal 'こわにちわ', @chars
  end

  def test_insert_throws_index_error
    assert_raise(IndexError) { @chars.insert(-12, 'わ') }
    assert_raise(IndexError) { @chars.insert(12, 'わ') }
  end

  def test_should_know_if_one_includes_the_other
    assert @chars.include?('')
    assert @chars.include?('ち')
    assert @chars.include?('わ')
    assert !@chars.include?('こちわ')
    assert !@chars.include?('a')
  end

  # Uses assert_raise instead of flunk under `rescue Exception`: the blanket
  # rescue also caught the assertion failure raised by flunk, so the test
  # could never fail.
  def test_include_raises_when_nil_is_passed
    assert_raise(TypeError, NoMethodError, "Expected chars.include?(nil) to raise TypeError or NoMethodError") do
      @chars.include?(nil)
    end
  end

  def test_index_should_return_character_offset
    assert_nil @chars.index('u')
    assert_equal 0, @chars.index('こに')
    assert_equal 2, @chars.index('ち')
    assert_equal 2, @chars.index('ち', -2)
    assert_nil @chars.index('ち', -1)
    assert_equal 3, @chars.index('わ')
    assert_equal 5, 'ééxééx'.mb_chars.index('x', 4)
  end

  def test_rindex_should_return_character_offset
    assert_nil @chars.rindex('u')
    assert_equal 1, @chars.rindex('に')
    assert_equal 2, @chars.rindex('ち', -2)
    assert_nil @chars.rindex('ち', -3)
    assert_equal 6, 'Café périferôl'.mb_chars.rindex('é')
    assert_equal 13, 'Café périferôl'.mb_chars.rindex(/\w/u)
  end

  # Each []= call mutates @chars; every expectation reflects all prior assignments.
  def test_indexed_insert_should_take_character_offsets
    @chars[2] = 'a'
    assert_equal 'こにaわ', @chars
    @chars[2] = 'ηη'
    assert_equal 'こにηηわ', @chars
    @chars[3, 2] = 'λλλ'
    assert_equal 'こにηλλλ', @chars
    @chars[1, 0] = "λ"
    assert_equal 'こλにηλλλ', @chars
    @chars[4..6] = "ηη"
    assert_equal 'こλにηηη', @chars
    @chars[/ηη/] = "λλλ"
    assert_equal 'こλにλλλη', @chars
    @chars[/(λλ)(.)/, 2] = "α"
    assert_equal 'こλにλλαη', @chars
    @chars["α"] = "¢"
    assert_equal 'こλにλλ¢η', @chars
    @chars["λλ"] = "ααα"
    assert_equal 'こλにααα¢η', @chars
  end

  # A failed assignment must leave the wrapped string untouched.
  def test_indexed_insert_should_raise_on_index_overflow
    before = @chars.to_s
    assert_raise(IndexError) { @chars[10] = 'a' }
    assert_raise(IndexError) { @chars[10, 4] = 'a' }
    assert_raise(IndexError) { @chars[/ii/] = 'a' }
    assert_raise(IndexError) { @chars[/()/, 10] = 'a' }
    assert_equal before, @chars
  end

  def test_indexed_insert_should_raise_on_range_overflow
    before = @chars.to_s
    assert_raise(RangeError) { @chars[10..12] = 'a' }
    assert_equal before, @chars
  end

  def test_rjust_should_raise_argument_errors_on_bad_arguments
    assert_raise(ArgumentError) { @chars.rjust(10, '') }
    assert_raise(ArgumentError) { @chars.rjust }
  end

  # The receiver is 4 characters long, so a width of 7 needs 3 pad characters.
  def test_rjust_should_count_characters_instead_of_bytes
    assert_equal UNICODE_STRING, @chars.rjust(-3)
    assert_equal UNICODE_STRING, @chars.rjust(0)
    assert_equal UNICODE_STRING, @chars.rjust(4)
    assert_equal " #{UNICODE_STRING}", @chars.rjust(5)
    assert_equal "   #{UNICODE_STRING}", @chars.rjust(7)
    assert_equal "---#{UNICODE_STRING}", @chars.rjust(7, '-')
    assert_equal "ααα#{UNICODE_STRING}", @chars.rjust(7, 'α')
    assert_equal "aba#{UNICODE_STRING}", @chars.rjust(7, 'ab')
    assert_equal "αηα#{UNICODE_STRING}", @chars.rjust(7, 'αη')
    assert_equal "αηαη#{UNICODE_STRING}", @chars.rjust(8, 'αη')
  end

  def test_ljust_should_raise_argument_errors_on_bad_arguments
    assert_raise(ArgumentError) { @chars.ljust(10, '') }
    assert_raise(ArgumentError) { @chars.ljust }
  end

  # The receiver is 4 characters long, so a width of 7 needs 3 pad characters.
  def test_ljust_should_count_characters_instead_of_bytes
    assert_equal UNICODE_STRING, @chars.ljust(-3)
    assert_equal UNICODE_STRING, @chars.ljust(0)
    assert_equal UNICODE_STRING, @chars.ljust(4)
    assert_equal "#{UNICODE_STRING} ", @chars.ljust(5)
    assert_equal "#{UNICODE_STRING}   ", @chars.ljust(7)
    assert_equal "#{UNICODE_STRING}---", @chars.ljust(7, '-')
    assert_equal "#{UNICODE_STRING}ααα", @chars.ljust(7, 'α')
    assert_equal "#{UNICODE_STRING}aba", @chars.ljust(7, 'ab')
    assert_equal "#{UNICODE_STRING}αηα", @chars.ljust(7, 'αη')
    assert_equal "#{UNICODE_STRING}αηαη", @chars.ljust(8, 'αη')
  end

  def test_center_should_raise_argument_errors_on_bad_arguments
    assert_raise(ArgumentError) { @chars.center(10, '') }
    assert_raise(ArgumentError) { @chars.center }
  end

  # When the padding cannot be split evenly the extra pad character goes on the
  # right, so a width of 7 yields one pad on the left and two on the right.
  def test_center_should_count_characters_instead_of_bytes
    assert_equal UNICODE_STRING, @chars.center(-3)
    assert_equal UNICODE_STRING, @chars.center(0)
    assert_equal UNICODE_STRING, @chars.center(4)
    assert_equal "#{UNICODE_STRING} ", @chars.center(5)
    assert_equal " #{UNICODE_STRING} ", @chars.center(6)
    assert_equal " #{UNICODE_STRING}  ", @chars.center(7)
    assert_equal "--#{UNICODE_STRING}--", @chars.center(8, '-')
    assert_equal "--#{UNICODE_STRING}---", @chars.center(9, '-')
    assert_equal "αα#{UNICODE_STRING}αα", @chars.center(8, 'α')
    assert_equal "αα#{UNICODE_STRING}ααα", @chars.center(9, 'α')
    assert_equal "a#{UNICODE_STRING}ab", @chars.center(7, 'ab')
    assert_equal "ab#{UNICODE_STRING}ab", @chars.center(8, 'ab')
    assert_equal "abab#{UNICODE_STRING}abab", @chars.center(12, 'ab')
    assert_equal "α#{UNICODE_STRING}αη", @chars.center(7, 'αη')
    assert_equal "αη#{UNICODE_STRING}αη", @chars.center(8, 'αη')
  end

  def test_lstrip_strips_whitespace_from_the_left_of_the_string
    assert_equal UNICODE_STRING, UNICODE_STRING.mb_chars.lstrip
    assert_equal UNICODE_STRING, (@whitespace + UNICODE_STRING).mb_chars.lstrip
    assert_equal UNICODE_STRING + @whitespace, (@whitespace + UNICODE_STRING + @whitespace).mb_chars.lstrip
  end

  def test_rstrip_strips_whitespace_from_the_right_of_the_string
    assert_equal UNICODE_STRING, UNICODE_STRING.mb_chars.rstrip
    assert_equal UNICODE_STRING, (UNICODE_STRING + @whitespace).mb_chars.rstrip
    assert_equal @whitespace + UNICODE_STRING, (@whitespace + UNICODE_STRING + @whitespace).mb_chars.rstrip
  end

  def test_strip_strips_whitespace
    assert_equal UNICODE_STRING, UNICODE_STRING.mb_chars.strip
    assert_equal UNICODE_STRING, (@whitespace + UNICODE_STRING).mb_chars.strip
    assert_equal UNICODE_STRING, (UNICODE_STRING + @whitespace).mb_chars.strip
    assert_equal UNICODE_STRING, (@whitespace + UNICODE_STRING + @whitespace).mb_chars.strip
  end

  def test_stripping_whitespace_leaves_whitespace_within_the_string_intact
    string_with_whitespace = UNICODE_STRING + @whitespace + UNICODE_STRING
    assert_equal string_with_whitespace, string_with_whitespace.mb_chars.strip
    assert_equal string_with_whitespace, string_with_whitespace.mb_chars.lstrip
    assert_equal string_with_whitespace, string_with_whitespace.mb_chars.rstrip
  end

  def test_size_returns_characters_instead_of_bytes
    assert_equal 0, ''.mb_chars.size
    assert_equal 4, @chars.size
    assert_equal 4, @chars.length
    assert_equal 5, ASCII_STRING.mb_chars.size
  end

  def test_reverse_reverses_characters
    assert_equal '', ''.mb_chars.reverse
    assert_equal 'わちにこ', @chars.reverse
  end

  # Reversal must give the same result in every normalization form.
  def test_reverse_should_work_with_normalized_strings
    str = 'bös'
    reversed_str = 'söb'
    assert_equal chars(reversed_str).normalize(:kc), chars(str).normalize(:kc).reverse
    assert_equal chars(reversed_str).normalize(:c), chars(str).normalize(:c).reverse
    assert_equal chars(reversed_str).normalize(:d), chars(str).normalize(:d).reverse
    assert_equal chars(reversed_str).normalize(:kd), chars(str).normalize(:kd).reverse
    assert_equal chars(reversed_str).decompose, chars(str).decompose.reverse
    assert_equal chars(reversed_str).compose, chars(str).compose.reverse
  end

  def test_slice_should_take_character_offsets
    assert_nil ''.mb_chars.slice(0)
    assert_equal 'こ', @chars.slice(0)
    assert_equal 'わ', @chars.slice(3)
    assert_nil ''.mb_chars.slice(-1..1)
    assert_nil ''.mb_chars.slice(-1, 1)
    assert_equal '', ''.mb_chars.slice(0..10)
    assert_equal 'にちわ', @chars.slice(1..3)
    assert_equal 'にちわ', @chars.slice(1, 3)
    assert_equal 'こ', @chars.slice(0, 1)
    assert_equal 'ちわ', @chars.slice(2..10)
    assert_equal '', @chars.slice(4..10)
    assert_equal 'に', @chars.slice(/に/u)
    assert_equal 'にち', @chars.slice(/に./u)
    assert_nil @chars.slice(/unknown/u)
    assert_equal 'にち', @chars.slice(/(にち)/u, 1)
    assert_nil @chars.slice(/(にち)/u, 2)
    assert_nil @chars.slice(7..6)
  end

  def test_slice_bang_returns_sliced_out_substring
    assert_equal 'にち', @chars.slice!(1..2)
  end

  def test_slice_bang_removes_the_slice_from_the_receiver
    chars = 'úüù'.mb_chars
    chars.slice!(0, 2)
    assert_equal 'ù', chars
  end

  def test_slice_should_throw_exceptions_on_invalid_arguments
    assert_raise(TypeError) { @chars.slice(2..3, 1) }
    assert_raise(TypeError) { @chars.slice(1, 2..3) }
    assert_raise(ArgumentError) { @chars.slice(1, 1, 1) }
  end

  def test_ord_should_return_unicode_value_for_first_character
    assert_equal 12371, @chars.ord
  end

  def test_upcase_should_upcase_ascii_characters
    assert_equal '', ''.mb_chars.upcase
    assert_equal 'ABC', 'aBc'.mb_chars.upcase
  end

  def test_downcase_should_downcase_ascii_characters
    assert_equal '', ''.mb_chars.downcase
    assert_equal 'abc', 'aBc'.mb_chars.downcase
  end

  def test_capitalize_should_work_on_ascii_characters
    assert_equal '', ''.mb_chars.capitalize
    assert_equal 'Abc', 'abc'.mb_chars.capitalize
  end

  def test_titleize_should_work_on_ascii_characters
    assert_equal '', ''.mb_chars.titleize
    assert_equal 'Abc Abc', 'abc abc'.mb_chars.titleize
  end

  def test_respond_to_knows_which_methods_the_proxy_responds_to
    assert ''.mb_chars.respond_to?(:slice) # Defined on Chars
    assert ''.mb_chars.respond_to?(:capitalize!) # Defined on Chars
    assert ''.mb_chars.respond_to?(:gsub) # Defined on String
    assert !''.mb_chars.respond_to?(:undefined_method) # Not defined
  end

  def test_acts_like_string
    assert 'Bambi'.mb_chars.acts_like_string?
  end
end
# The default Multibyte Chars proxy has more features than the normal String implementation. Tests
# for the implementation of these features should run on all Ruby versions and shouldn't be tested
# through the proxy methods.
class MultibyteCharsExtrasTest < Test::Unit::TestCase
  include MultibyteTestHelpers

  def test_upcase_should_be_unicode_aware
    assert_equal "АБВГД\0F", chars("аБвгд\0f").upcase
    assert_equal 'こにちわ', chars('こにちわ').upcase
  end

  def test_downcase_should_be_unicode_aware
    assert_equal "абвгд\0f", chars("аБвгд\0f").downcase
    assert_equal 'こにちわ', chars('こにちわ').downcase
  end

  # Maps each input to its expected capitalized form.
  def test_capitalize_should_be_unicode_aware
    { 'аБвг аБвг' => 'Абвг абвг',
      'аБвг АБВГ' => 'Абвг абвг',
      'АБВГ АБВГ' => 'Абвг абвг',
      '' => '' }.each do |f, t|
      assert_equal t, chars(f).capitalize
    end
  end

  def test_titleize_should_be_unicode_aware
    assert_equal "Él Que Se Enteró", chars("ÉL QUE SE ENTERÓ").titleize
    assert_equal "Абвг Абвг", chars("аБвг аБвг").titleize
  end

  def test_titleize_should_not_affect_characters_that_do_not_case_fold
    assert_equal "日本語", chars("日本語").titleize
  end

  def test_limit_should_not_break_on_blank_strings
    example = chars('')
    assert_equal example, example.limit(0)
    assert_equal example, example.limit(1)
  end

  # limit takes a byte budget; the expectations below show that a character is
  # only kept when all of its bytes fit within that budget.
  def test_limit_should_work_on_a_multibyte_string
    example = chars(UNICODE_STRING)
    bytesize = UNICODE_STRING.bytesize

    assert_equal UNICODE_STRING, example.limit(bytesize)
    assert_equal '', example.limit(0)
    assert_equal '', example.limit(1)
    assert_equal 'こ', example.limit(3)
    assert_equal 'こに', example.limit(6)
    assert_equal 'こに', example.limit(8)
    assert_equal 'こにち', example.limit(9)
    assert_equal 'こにちわ', example.limit(50)
  end

  def test_limit_should_work_on_an_ascii_string
    ascii = chars(ASCII_STRING)
    assert_equal ASCII_STRING, ascii.limit(ASCII_STRING.length)
    assert_equal '', ascii.limit(0)
    assert_equal 'o', ascii.limit(1)
    assert_equal 'oh', ascii.limit(2)
    assert_equal 'ohay', ascii.limit(4)
    assert_equal 'ohayo', ascii.limit(50)
  end

  # Measures in bytes (bytesize), not characters: the limit is a byte budget,
  # and every budget from 1 up to the full byte length is checked.
  def test_limit_should_keep_under_the_specified_byte_limit
    example = chars(UNICODE_STRING)
    (1..UNICODE_STRING.bytesize).each do |limit|
      assert example.limit(limit).to_s.bytesize <= limit
    end
  end

  def test_composition_exclusion_is_set_up_properly
    # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
    qa = [0x915, 0x93c].pack('U*')
    assert_equal qa, chars(qa).normalize(:c)
  end

  # Test for the Public Review Issue #29, bad explanation of composition might lead to a
  # bad implementation: http://www.unicode.org/review/pr-29.html
  def test_normalization_C_pri_29
    [
      [0x0B47, 0x0300, 0x0B3E],
      [0x1100, 0x0300, 0x1161]
    ].map { |c| c.pack('U*') }.each do |c|
      assert_equal_codepoints c, chars(c).normalize(:c)
    end
  end

  def test_normalization_shouldnt_strip_null_bytes
    null_byte_str = "Test\0test"

    assert_equal null_byte_str, chars(null_byte_str).normalize(:kc)
    assert_equal null_byte_str, chars(null_byte_str).normalize(:c)
    assert_equal null_byte_str, chars(null_byte_str).normalize(:d)
    assert_equal null_byte_str, chars(null_byte_str).normalize(:kd)
    assert_equal null_byte_str, chars(null_byte_str).decompose
    assert_equal null_byte_str, chars(null_byte_str).compose
  end

  # The code points below are DECIMAL values; the character names match the
  # decimal interpretation, which is also what the expected results rely on.
  def test_simple_normalization
    comp_str = [
      44,  # COMMA (U+002C)
      307, # LATIN SMALL LIGATURE IJ (U+0133)
      328, # LATIN SMALL LETTER N WITH CARON (U+0148)
      323  # LATIN CAPITAL LETTER N WITH ACUTE (U+0143)
    ].pack("U*")

    assert_equal_codepoints '', chars('').normalize
    assert_equal_codepoints [44,105,106,328,323].pack("U*"), chars(comp_str).normalize(:kc).to_s
    assert_equal_codepoints [44,307,328,323].pack("U*"), chars(comp_str).normalize(:c).to_s
    assert_equal_codepoints [44,307,110,780,78,769].pack("U*"), chars(comp_str).normalize(:d).to_s
    assert_equal_codepoints [44,105,106,110,780,78,769].pack("U*"), chars(comp_str).normalize(:kd).to_s
  end

  # Each entry pairs an input with its expected grapheme count. Array inputs
  # are lists of character-class names expanded by string_from_classes.
  def test_should_compute_grapheme_length
    [
      ['', 0],
      ['abc', 3],
      ['こにちわ', 4],
      [[0x0924, 0x094D, 0x0930].pack('U*'), 2],
      [%w(cr lf), 1],
      [%w(l l), 1],
      [%w(l v), 1],
      [%w(l lv), 1],
      [%w(l lvt), 1],
      [%w(lv v), 1],
      [%w(lv t), 1],
      [%w(v v), 1],
      [%w(v t), 1],
      [%w(lvt t), 1],
      [%w(t t), 1],
      [%w(n extend), 1],
      [%w(n n), 2],
      [%w(n cr lf n), 3],
      [%w(n l v t), 2]
    ].each do |input, expected_length|
      str = input.kind_of?(Array) ? string_from_classes(input) : input
      assert_equal expected_length, chars(str).g_length
    end
  end

  def test_tidy_bytes_should_tidy_bytes
    # Maps a single invalid (or ASCII) byte to the character tidy_bytes should produce for it.
    single_byte_cases = {
      "\x21" => "!", # Valid ASCII byte, low
      "\x41" => "A", # Valid ASCII byte, mid
      "\x7E" => "~", # Valid ASCII byte, high
      "\x80" => "€", # Continuation byte, low (CP-1252)
      "\x94" => "”", # Continuation byte, mid (CP-1252)
      "\x9F" => "Ÿ", # Continuation byte, high (CP-1252)
      "\xC0" => "À", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC1" => "Á", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
      "\xC2" => "Â", # Start of 2-byte sequence, low
      "\xC8" => "È", # Start of 2-byte sequence, mid
      "\xDF" => "ß", # Start of 2-byte sequence, high
      "\xE0" => "à", # Start of 3-byte sequence, low
      "\xE8" => "è", # Start of 3-byte sequence, mid
      "\xEF" => "ï", # Start of 3-byte sequence, high
      "\xF0" => "ð", # Start of 4-byte sequence
      "\xF1" => "ñ", # Unused byte
      "\xFF" => "ÿ", # Restricted byte
      "\x00" => "\x00" # null char
    }

    # Each bad byte is checked alone, repeated, and next to ASCII and multibyte neighbours.
    single_byte_cases.each do |bad, good|
      assert_equal good, chars(bad).tidy_bytes.to_s
      assert_equal "#{good}#{good}", chars("#{bad}#{bad}").tidy_bytes
      assert_equal "#{good}#{good}#{good}", chars("#{bad}#{bad}#{bad}").tidy_bytes
      assert_equal "#{good}a", chars("#{bad}a").tidy_bytes
      assert_equal "#{good}á", chars("#{bad}á").tidy_bytes
      assert_equal "a#{good}a", chars("a#{bad}a").tidy_bytes
      assert_equal "á#{good}á", chars("á#{bad}á").tidy_bytes
      assert_equal "a#{good}", chars("a#{bad}").tidy_bytes
      assert_equal "á#{good}", chars("á#{bad}").tidy_bytes
    end

    byte_string = "\270\236\010\210\245"
    tidy_string = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
    assert_equal_codepoints tidy_string, chars(byte_string).tidy_bytes
    assert_nothing_raised { chars(byte_string).tidy_bytes.to_s.unpack('U*') }

    # UTF-8 leading byte followed by too few continuation bytes
    assert_equal_codepoints "\xc3\xb0\xc2\xa5\xc2\xa4\x21", chars("\xf0\xa5\xa4\x21").tidy_bytes
  end

  def test_tidy_bytes_should_forcibly_tidy_bytes_if_specified
    byte_string = "\xF0\xA5\xA4\xA4" # valid as both CP-1252 and UTF-8, but with different interpretations.
    assert_not_equal "𥤤", chars(byte_string).tidy_bytes
    # Forcible conversion to UTF-8
    assert_equal "𥤤", chars(byte_string).tidy_bytes(true)
  end

  private

  # Builds a string containing one representative character for each of the
  # given character-class names (Strings such as 'l', 'lv', 'cr').
  def string_from_classes(classes)
    # Characters from the character classes as described in UAX #29
    character_from_class = {
      :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A,
      :extend => 0x094D, :n => 0x64
    }
    classes.map { |k| character_from_class[k.intern] }.pack('U*')
  end
end
# Covers the private offset-translation helper on Chars.
class MultibyteInternalsTest < ActiveSupport::TestCase
  include MultibyteTestHelpers

  # NOTE(review): the title says "character offset to a byte offset", but the
  # pairs below feed a byte offset in and expect a character offset back —
  # confirm which direction the title is meant to describe.
  test "Chars translates a character offset to a byte offset" do
    example = chars("Puisque c'était son erreur, il m'a aidé")
    # Each pair is [byte offset passed in, character offset expected back].
    offset_pairs = [[0, 0], [3, 3], [12, 11], [14, 13], [41, 39]]

    offset_pairs.each do |byte_offset, character_offset|
      failure_message = "Expected byte offset #{byte_offset} to translate to #{character_offset}"
      # translate_offset is private, hence the send.
      assert_equal(character_offset, example.send(:translate_offset, byte_offset), failure_message)
    end
  end
end
|