3 files changed, 565 insertions, 0 deletions
diff --git a/activesupport/test/multibyte_chars_test.rb b/activesupport/test/multibyte_chars_test.rb
new file mode 100644
index 0000000000..e5ad9d26ee
--- /dev/null
+++ b/activesupport/test/multibyte_chars_test.rb
@@ -0,0 +1,163 @@
+require File.dirname(__FILE__) + '/abstract_unit'
+
+$KCODE = 'UTF8'
+
+class CharsTest < Test::Unit::TestCase
+  
+  def setup
+    @s = {
+      :utf8 => "Abcd Блå ﬃ блa  埋",
+      :ascii => "asci ias c iia s",
+      :bytes => "\270\236\010\210\245"
+    }
+  end
+  
+  def test_sanity
+    @s.each do |t, s|
+      assert s.respond_to?(:chars), "All string should have the chars method (#{t})"
+      assert s.respond_to?(:to_s), "All string should have the to_s method (#{t})"
+      assert_kind_of ActiveSupport::Multibyte::Chars, s.chars, "#chars should return an instance of Chars (#{t})"
+    end
+  end
+  
+  def test_comparability
+    @s.each do |t, s|
+      assert_equal s, s.chars.to_s, "Chars#to_s should return enclosed string unchanged"
+    end
+    assert_nothing_raised do
+      assert_equal "a", "a", "Normal string comparisons should be unaffected"
+      assert_not_equal "a", "b", "Normal string comparisons should be unaffected"
+      assert_not_equal "a".chars, "b".chars, "Chars objects should be comparable"
+      assert_equal "a".chars, "A".downcase.chars, "Chars objects should be comparable to each other"
+      assert_equal "a".chars, "A".downcase, "Chars objects should be comparable to strings coming from elsewhere"
+    end
+    
+    assert !@s[:utf8].eql?(@s[:utf8].chars), "Strict comparison is not supported"
+    assert_equal @s[:utf8], @s[:utf8].chars, "Chars should be compared by their enclosed string"
+
+    other_string = @s[:utf8].dup
+    assert_equal other_string, @s[:utf8].chars, "Chars should be compared by their enclosed string"
+    assert_equal other_string.chars, @s[:utf8].chars, "Chars should be compared by their enclosed string"
+    
+    strings = ['builder'.chars, 'armor'.chars, 'zebra'.chars]
+    strings.sort!
+    assert_equal ['armor', 'builder', 'zebra'], strings, "Chars should be sortable based on their enclosed string"
+
+    # This leads to a StackLevelTooDeep exception if the comparison is not wired properly
+    assert_raise(NameError) do
+      Chars
+    end
+  end
+  
+  def test_utf8?
+    assert @s[:utf8].is_utf8?, "UTF-8 strings are UTF-8"
+    assert @s[:ascii].is_utf8?, "All ASCII strings are also valid UTF-8"
+    assert !@s[:bytes].is_utf8?, "This bytestring isn't UTF-8"
+  end
+  
+  # The tests for the following methods are defined here because they can only be defined on the Chars class for
+  # various reasons
+  
+  def test_gsub
+    assert_equal 'éxa', 'éda'.chars.gsub(/d/, 'x')
+    with_kcode('none') do
+      assert_equal 'éxa', 'éda'.chars.gsub(/d/, 'x')
+    end
+  end
+  
+  def test_split
+    word = "eﬃcient"
+    chars = ["e", "ﬃ", "c", "i", "e", "n", "t"]
+    assert_equal chars, word.split(//)
+    assert_equal chars, word.chars.split(//)
+    assert_kind_of ActiveSupport::Multibyte::Chars, word.chars.split(//).first, "Split should return Chars instances"
+  end
+  
+  def test_regexp
+    with_kcode('none') do
+      assert_equal 12, (@s[:utf8].chars =~ /ﬃ/),
+        "Regex matching should be bypassed to String"
+    end
+    with_kcode('UTF8') do
+      assert_equal 9, (@s[:utf8].chars =~ /ﬃ/),
+        "Regex matching should be unicode aware"
+    end
+  end
+  
+  def test_pragma
+    with_kcode('UTF8') do
+      assert " ".chars.send(:utf8_pragma?), "UTF8 pragma should be on because KCODE is UTF8"
+    end
+    with_kcode('none') do
+      assert !" ".chars.send(:utf8_pragma?), "UTF8 pragma should be off"
+    end
+  end
+  
+  def test_handler_setting
+    handler = ''.chars.handler
+    
+    ActiveSupport::Multibyte::Chars.handler = :first
+    assert_equal :first, ''.chars.handler
+    ActiveSupport::Multibyte::Chars.handler = :second
+    assert_equal :second, ''.chars.handler
+    assert_raise(NoMethodError) do
+      ''.chars.handler.split
+    end
+    
+    ActiveSupport::Multibyte::Chars.handler = handler
+  end
+  
+  def test_method_chaining
+    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.downcase
+    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.strip, "Strip should return a Chars object"
+    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.downcase.strip, "The Chars object should be " +
+        "forwarded down the call path for chaining"
+    assert_equal 'foo', "  FOO   ".chars.normalize.downcase.strip, "The Chars that results from the " +
+      " operations should be comparable to the string value of the result"
+  end
+  
+  def test_passthrough_on_kcode
+    # The easiest way to check if the passthrough is in place is through #size
+    with_kcode('nonce') do
+      assert_equal 26, @s[:utf8].chars.size
+    end
+    with_kcode('UTF8') do
+      assert_equal 17, @s[:utf8].chars.size
+    end
+  end
+    
+  def test_destructiveness  
+    # Note that we're testing the destructiveness here and not the correct behaviour of the methods
+    str = 'ac'
+    str.chars.insert(1, 'b')
+    assert_equal 'abc', str, 'Insert should be destructive for a string'
+    
+    str = 'ac'
+    str.chars.reverse!
+    assert_equal 'ca', str, 'reverse! should be destructive for a string'
+  end
+  
+  def test_resilience
+    assert_nothing_raised do
+      assert_equal 1, @s[:bytes].chars.size, "There's only one valid utf-8 byte in the string"
+    end
+    assert_nothing_raised do
+      assert_equal "\010", @s[:bytes].chars.reverse, "There's only one valid utf-8 byte in the string"
+    end
+    assert_nothing_raised do
+      @s[:bytes].chars.reverse!
+      assert_equal "\010", @s[:bytes], "There's only one valid utf-8 byte in the string"
+    end
+  end
+  
+  protected
+
+  def with_kcode(kcode) # Runs the given block with $KCODE set to +kcode+ and restores the previous value afterwards
+    old_kcode, $KCODE = $KCODE, kcode
+    begin
+      yield
+    ensure
+      $KCODE = old_kcode
+    end
+  end
+end
diff --git a/activesupport/test/multibyte_conformance.rb b/activesupport/test/multibyte_conformance.rb
new file mode 100644
index 0000000000..dfd77bd6e1
--- /dev/null
+++ b/activesupport/test/multibyte_conformance.rb
@@ -0,0 +1,141 @@
+require File.dirname(__FILE__) + '/abstract_unit'
+require 'open-uri'
+
+$KCODE = 'UTF8'
+UNIDATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd"
+UNIDATA_FILE = '/NormalizationTest.txt'
+CACHE_DIR = File.dirname(__FILE__) + '/cache'
+
+class Downloader
+  def self.download(from, to)
+    unless File.exist?(to)
+      $stderr.puts "Downloading #{from} to #{to}"
+      unless File.exists?(File.dirname(to))
+        system "mkdir -p #{File.dirname(to)}"
+      end
+      open(from) do |source|
+        File.open(to, 'w') do |target|
+          source.each_line do |l|
+            target.write l
+          end
+        end
+       end
+     end
+  end
+end
+
+class String
+  # Unicode Inspect returns the codepoints of the string in hex
+  def ui
+    "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' '))
+  end unless ''.respond_to?(:ui)
+end
+
+Dir.mkdir(CACHE_DIR) unless File.exist?(CACHE_DIR) # the cache directory must exist before the download below
+Downloader.download(UNIDATA_URL + UNIDATA_FILE, CACHE_DIR + UNIDATA_FILE)
+
+module ConformanceTest
+  def test_normalizations_C
+    each_line_of_norm_tests do |*cols|
+      col1, col2, col3, col4, col5, comment = *cols
+      
+      # CONFORMANCE:
+      # 1. The following invariants must be true for all conformant implementations
+      #
+      #    NFC
+      #      c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
+      assert_equal col2.ui, @handler.normalize(col1, :c).ui, "Form C - Col 2 has to be NFC(1) - #{comment}"
+      assert_equal col2.ui, @handler.normalize(col2, :c).ui, "Form C - Col 2 has to be NFC(2) - #{comment}"
+      assert_equal col2.ui, @handler.normalize(col3, :c).ui, "Form C - Col 2 has to be NFC(3) - #{comment}"
+      #
+      #      c4 ==  NFC(c4) ==  NFC(c5)
+      assert_equal col4.ui, @handler.normalize(col4, :c).ui, "Form C - Col 4 has to be C(4) - #{comment}"
+      assert_equal col4.ui, @handler.normalize(col5, :c).ui, "Form C - Col 4 has to be C(5) - #{comment}"
+    end
+  end
+  
+  def test_normalizations_D
+    each_line_of_norm_tests do |*cols|
+      col1, col2, col3, col4, col5, comment = *cols
+      #
+      #    NFD
+      #      c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
+      assert_equal col3.ui, @handler.normalize(col1, :d).ui, "Form D - Col 3 has to be NFD(1) - #{comment}"
+      assert_equal col3.ui, @handler.normalize(col2, :d).ui, "Form D - Col 3 has to be NFD(2) - #{comment}"
+      assert_equal col3.ui, @handler.normalize(col3, :d).ui, "Form D - Col 3 has to be NFD(3) - #{comment}"
+      #      c5 ==  NFD(c4) ==  NFD(c5)
+      assert_equal col5.ui, @handler.normalize(col4, :d).ui, "Form D - Col 5 has to be NFD(4) - #{comment}"
+      assert_equal col5.ui, @handler.normalize(col5, :d).ui, "Form D - Col 5 has to be NFD(5) - #{comment}"
+    end
+  end
+  
+  def test_normalizations_KC
+    each_line_of_norm_tests do | *cols |
+      col1, col2, col3, col4, col5, comment = *cols  
+      #
+      #    NFKC
+      #      c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
+      assert_equal col4.ui, @handler.normalize(col1, :kc).ui, "Form D - Col 4 has to be NFKC(1) - #{comment}"
+      assert_equal col4.ui, @handler.normalize(col2, :kc).ui, "Form D - Col 4 has to be NFKC(2) - #{comment}"
+      assert_equal col4.ui, @handler.normalize(col3, :kc).ui, "Form D - Col 4 has to be NFKC(3) - #{comment}"
+      assert_equal col4.ui, @handler.normalize(col4, :kc).ui, "Form D - Col 4 has to be NFKC(4) - #{comment}"
+      assert_equal col4.ui, @handler.normalize(col5, :kc).ui, "Form D - Col 4 has to be NFKC(5) - #{comment}"
+    end
+  end
+  
+  def test_normalizations_KD
+    each_line_of_norm_tests do | *cols |
+      col1, col2, col3, col4, col5, comment = *cols  
+      #
+      #    NFKD
+      #      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
+      assert_equal col5.ui, @handler.normalize(col1, :kd).ui, "Form KD - Col 5 has to be NFKD(1) - #{comment}"
+      assert_equal col5.ui, @handler.normalize(col2, :kd).ui, "Form KD - Col 5 has to be NFKD(2) - #{comment}"
+      assert_equal col5.ui, @handler.normalize(col3, :kd).ui, "Form KD - Col 5 has to be NFKD(3) - #{comment}"
+      assert_equal col5.ui, @handler.normalize(col4, :kd).ui, "Form KD - Col 5 has to be NFKD(4) - #{comment}"
+      assert_equal col5.ui, @handler.normalize(col5, :kd).ui, "Form KD - Col 5 has to be NFKD(5) - #{comment}"
+    end
+  end
+  
+  protected
+    def each_line_of_norm_tests(&block)
+      lines = 0
+      max_test_lines = 0 # Don't limit below 38, because that's the header of the testfile
+      File.open(File.dirname(__FILE__) + '/cache' + UNIDATA_FILE, 'r') do | f |
+        until f.eof? || (max_test_lines > 38 and lines > max_test_lines)
+          lines += 1
+          line = f.gets.chomp!
+          next if (line.empty? || line =~ /^\#/)      
+          
+          cols, comment = line.split("#")
+          cols = cols.split(";").map{|e| e.strip}.reject{|e| e.empty? }
+          next unless cols.length == 5
+          
+          # codepoints are in hex in the test suite, pack wants them as integers
+          cols.map!{|c| c.split.map{|codepoint| codepoint.to_i(16)}.pack("U*") }
+          cols << comment
+          
+          yield(*cols)
+        end
+      end
+    end
+end
+
+begin
+  require_library_or_gem('utf8proc_native')
+  require 'active_record/multibyte/handlers/utf8_handler_proc'
+  class ConformanceTestProc < Test::Unit::TestCase
+    include ConformanceTest
+    def setup
+      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
+    end
+  end
+rescue LoadError
+end
+
+class ConformanceTestPure < Test::Unit::TestCase # runs the ConformanceTest cases with @handler = UTF8Handler
+  include ConformanceTest
+  def setup
+    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler
+  end
+end
+\ No newline at end of file
diff --git a/activesupport/test/multibyte_handler_test.rb b/activesupport/test/multibyte_handler_test.rb
new file mode 100644
index 0000000000..08291c1212
--- /dev/null
+++ b/activesupport/test/multibyte_handler_test.rb
@@ -0,0 +1,261 @@
+require File.dirname(__FILE__) + '/abstract_unit'
+
+$KCODE = 'UTF8'
+
+class String
+  # Unicode Inspect returns the codepoints of the string in hex
+  def ui
+    "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' '))
+  end unless ''.respond_to?(:ui)
+end
+
+module UTF8HandlingTest
+  
+  def common_setup
+    # This is an ASCII string with some russian strings and a ligature. It's nicely calibrated, because
+    # slicing it at some specific bytes will kill your characters if you use standard Ruby routines.
+    # It has both capital and standard letters, so that we can test case conversions easily.
+    # It has 26 characters and 28 when the ligature gets split during normalization.
+    @string =     "Abcd Блå ﬃ бла бла бла бла"
+    @string_kd =  "Abcd Блå ffi бла бла бла бла"
+    @string_kc =  "Abcd Блå ffi бла бла бла бла"
+    @string_c =   "Abcd Блå ﬃ бла бла бла бла"
+    @string_d =   "Abcd Блå ﬃ бла бла бла бла"
+    @bytestring = "\270\236\010\210\245" # Not UTF-8
+    
+    # Characters from the character classes as described in UAX #29
+    @character_from_class = {
+      :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A,
+      :extend => 0x094D, :n => 0x64
+    }
+  end
+  
+  def test_utf8_recognition
+    assert ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@string),
+      "Should recognize as a valid UTF-8 string"
+    assert !ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@bytestring), "This is bytestring, not UTF-8"
+  end
+  
+  def test_simple_normalization
+    # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
+    assert_equal [0x915, 0x93c].pack('U*').ui, [0x915, 0x93c].pack('U*').chars.normalize(:c).to_s.ui
+    
+    null_byte_str = "Test\0test"
+    
+    assert_equal '', @handler.normalize(''), "Empty string should not break things"
+    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kc).ui, "Null byte should remain"
+    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :c).ui, "Null byte should remain" 
+    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :d).ui, "Null byte should remain"
+    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kd).ui, "Null byte should remain"
+    assert_equal null_byte_str.ui, @handler.decompose(null_byte_str).ui, "Null byte should remain"
+    assert_equal null_byte_str.ui, @handler.compose(null_byte_str).ui, "Null byte should remain" 
+    # Codepoints below are DECIMAL. Per the expectations, 307 splits only in KC/KD; 328 and 323 split only in D/KD.
+    comp_str = [
+      44,  # decimal 44  = U+002C COMMA
+      307, # decimal 307 = U+0133 LATIN SMALL LIGATURE IJ
+      328, # decimal 328 = U+0148 LATIN SMALL LETTER N WITH CARON
+      323 # decimal 323 = U+0143 LATIN CAPITAL LETTER N WITH ACUTE
+    ].pack("U*")
+    norm_str_KC = [44,105,106,328,323].pack("U*")
+    norm_str_C = [44,307,328,323].pack("U*")
+    norm_str_D = [44,307,110,780,78,769].pack("U*")
+    norm_str_KD = [44,105,106,110,780,78,769].pack("U*")
+    
+    assert_equal norm_str_KC.ui, @handler.normalize(comp_str, :kc).ui, "Should normalize KC"
+    assert_equal norm_str_C.ui, @handler.normalize(comp_str, :c).ui, "Should normalize C"
+    assert_equal norm_str_D.ui, @handler.normalize(comp_str, :d).ui, "Should normalize D"
+    assert_equal norm_str_KD.ui, @handler.normalize(comp_str, :kd).ui, "Should normalize KD"
+    
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.normalize(@bytestring) }
+  end
+  
+  # Test for Public Review Issue #29: a bad explanation of composition might lead to a
+  # bad implementation: http://www.unicode.org/review/pr-29.html
+  def test_normalization_C_pri_29
+    [
+      [0x0B47, 0x0300, 0x0B3E],
+      [0x1100, 0x0300, 0x1161]
+    ].map { |c| c.pack('U*') }.each do |c|
+      assert_equal c.ui, @handler.normalize(c, :c).ui, "Composition is implemented incorrectly"
+    end
+  end
+  
+  def test_casefolding
+    simple_str = "abCdef"
+    simple_str_upcase = "ABCDEF"
+    simple_str_downcase = "abcdef"
+    
+    assert_equal '', @handler.downcase(@handler.upcase('')), "Empty string should not break things"
+    assert_equal simple_str_upcase, @handler.upcase(simple_str), "should upcase properly"
+    assert_equal simple_str_downcase, @handler.downcase(simple_str), "should downcase properly"
+    assert_equal simple_str_downcase, @handler.downcase(@handler.upcase(simple_str_downcase)), "upcase and downcase should be mirrors"
+    
+    rus_str = "аБвгд\0f"
+    rus_str_upcase = "АБВГД\0F"
+    rus_str_downcase = "абвгд\0f"
+    assert_equal rus_str_upcase, @handler.upcase(rus_str), "should upcase properly honoring null-byte"
+    assert_equal rus_str_downcase, @handler.downcase(rus_str), "should downcase properly honoring null-byte"
+    
+    jap_str = "の埋め込み化対応はほぼ完成"
+    assert_equal jap_str, @handler.upcase(jap_str), "Japanse has no upcase, should remain unchanged"
+    assert_equal jap_str, @handler.downcase(jap_str), "Japanse has no downcase, should remain unchanged"
+    
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.upcase(@bytestring) }
+  end
+  
+  def test_capitalize
+    { 'аБвг аБвг' => 'Абвг абвг',
+      'аБвг АБВГ' => 'Абвг абвг',
+      'АБВГ АБВГ' => 'Абвг абвг',
+      '' => '' }.each do |f,t|
+        assert_equal t, @handler.capitalize(f), "Capitalize should work as expected"
+    end
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.capitalize(@bytestring) }
+  end
+  
+  def test_translate_offset
+    str = "Блaå" # [2, 2, 1, 2] bytes
+    assert_equal 0, @handler.translate_offset('', 0), "Offset for an empty string makes no sense, return 0"
+    assert_equal 0, @handler.translate_offset(str, 0), "First character, first byte"
+    assert_equal 0, @handler.translate_offset(str, 1), "First character, second byte"
+    assert_equal 1, @handler.translate_offset(str, 2), "Second character, third byte"
+    assert_equal 1, @handler.translate_offset(str, 3), "Second character, fourth byte"
+    assert_equal 2, @handler.translate_offset(str, 4), "Third character, fifth byte"
+    assert_equal 3, @handler.translate_offset(str, 5), "Fourth character, sixth byte"
+    assert_equal 3, @handler.translate_offset(str, 6), "Fourth character, seventh byte"
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.translate_offset(@bytestring, 3) }
+  end
+  
+  def test_insert
+    assert_equal '', @handler.insert('', 0, ''), "Empty string should not break things"
+    assert_equal "Abcd Блå ﬃБУМ бла бла бла бла", @handler.insert(@string, 10, "БУМ"), 
+      "Text should be inserted at right codepoints"
+    assert_equal "Abcd Блå ﬃБУМ бла бла бла бла", @string, "Insert should be destructive"
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) do
+      @handler.insert(@bytestring, 2, "\210")
+    end
+  end
+  
+  def test_reverse
+    str = "wБлåa \n"
+    rev = "\n aåлБw"
+    assert_equal '', @handler.reverse(''), "Empty string shouldn't change"
+    assert_equal rev.ui, @handler.reverse(str).ui, "Should reverse properly"
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.reverse(@bytestring) }
+  end
+  
+  def test_size
+    assert_equal 0, @handler.size(''), "Empty string has size 0"
+    assert_equal 26, @handler.size(@string), "String length should be 26"
+    assert_equal 26, @handler.length(@string), "String length method should be properly aliased"
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.size(@bytestring) }
+  end
+  
+  def test_slice
+    assert_equal 0x41, @handler.slice(@string, 0), "Singular characters should return codepoints"
+    assert_equal 0xE5, @handler.slice(@string, 7), "Singular characters should return codepoints"
+    assert_equal nil, @handler.slice('', -1..1), "Broken range should return nil"
+    assert_equal '', @handler.slice('', 0..10), "Empty string should not break things"
+    assert_equal "d Блå ﬃ", @handler.slice(@string, 3..9), "Unicode characters have to be returned"
+    assert_equal " Блå ﬃ ", @handler.slice(@string, 4..10), "Unicode characters have to be returned"
+    assert_equal "", @handler.slice(@string, 7..6), "Range is empty, should return an empty string"
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.slice(@bytestring, 2..3) }
+  end
+  
+  def test_grapheme_cluster_length
+    assert_equal 0, @handler.g_length(''), "String should count 0 grapheme clusters"
+    assert_equal 2, @handler.g_length([0x0924, 0x094D, 0x0930].pack('U*')), "String should count 2 grapheme clusters"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(cr lf))), "Don't cut between CR and LF"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(l l))), "Don't cut between L"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(l v))), "Don't cut between L and V"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(l lv))), "Don't cut between L and LV"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(l lvt))), "Don't cut between L and LVT"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(lv v))), "Don't cut between LV and V"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(lv t))), "Don't cut between LV and T"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(v v))), "Don't cut between V and V"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(v t))), "Don't cut between V and T"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(lvt t))), "Don't cut between LVT and T"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(t t))), "Don't cut between T and T"
+    assert_equal 1, @handler.g_length(string_from_classes(%w(n extend))), "Don't cut before Extend"
+    assert_equal 2, @handler.g_length(string_from_classes(%w(n n))), "Cut between normal characters"
+    assert_equal 3, @handler.g_length(string_from_classes(%w(n cr lf n))), "Don't cut between CR and LF"
+    assert_equal 2, @handler.g_length(string_from_classes(%w(n l v t))), "Don't cut between L, V and T"
+    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.g_length(@bytestring) }
+  end
+  
+  def test_index
+     s = "Καλημέρα κόσμε!"
+     assert_equal 0, @handler.index('', ''), "The empty string is always found at the beginning of the string"
+     assert_equal 0, @handler.index('haystack', ''), "The empty string is always found at the beginning of the string"
+     assert_equal 0, @handler.index(s, 'Κ'), "Greek K is at 0"
+     assert_equal 1, @handler.index(s, 'α'), "Greek Alpha is at 1"
+     
+     assert_equal nil, @handler.index(@bytestring, 'a')
+     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.index(@bytestring, "\010") }
+  end
+  
+  def test_strip
+    # A unicode aware version of strip should strip all 26 types of whitespace. This includes the ZERO WIDTH NO-BREAK
+    # SPACE (U+FEFF) aka BOM (byte order mark). The byte order mark has no place in UTF-8 because it's used to detect LE and BE.
+    b = "\n" + [
+      32, # SPACE
+      8195, # EM SPACE
+      8199, # FIGURE SPACE,
+      8201, # THIN SPACE
+      8202, # HAIR SPACE
+      65279, # NO BREAK SPACE (ZW)
+    ].pack('U*')
+    m = "word блин\n\n\n  word"
+    e = [
+    65279, # NO BREAK SPACE (ZW)
+    8201, # THIN SPACE
+    8199, # FIGURE SPACE,      
+    32, # SPACE
+    ].pack('U*')
+    string = b+m+e
+    
+    assert_equal '', @handler.strip(''), "Empty string should stay empty"
+    assert_equal m+e, @handler.lstrip(string), "Whitespace should be gone on the left"
+    assert_equal b+m, @handler.rstrip(string), "Whitespace should be gone on the right"
+    assert_equal m, @handler.strip(string), "Whitespace should be stripped on both sides"
+    
+    bs = "\n   #{@bytestring} \n\n"
+    assert_equal @bytestring, @handler.strip(bs), "Invalid unicode strings should still strip"
+  end
+  
+  def test_tidy_bytes
+    assert_equal "\010", @handler.tidy_bytes(@bytestring)
+    assert_equal "a\010a", @handler.tidy_bytes('a' + @bytestring + 'a')
+    assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
+  end
+  
+  protected
+  
+  def string_from_classes(classes)
+    classes.collect do |k|
+      @character_from_class[k.intern]
+    end.pack('U*')
+  end
+end
+
+
+begin
+  require_library_or_gem('utf8proc_native')
+  require 'active_record/multibyte/handlers/utf8_handler_proc'
+  class UTF8HandlingTestProc < Test::Unit::TestCase
+    include UTF8HandlingTest
+    def setup
+      common_setup
+      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
+    end
+  end
+rescue LoadError
+end
+
+class UTF8HandlingTestPure < Test::Unit::TestCase # runs the UTF8HandlingTest cases with @handler = UTF8Handler
+  include UTF8HandlingTest
+  def setup
+    common_setup
+    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler
+  end
+end
+\ No newline at end of file