4 files changed, 239 insertions, 18 deletions
diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
index d8d58f3bce..f59285daba 100644
--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
@@ -29,7 +29,35 @@ module ActiveSupport #:nodoc:
     #
     # Example:
     #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32
-    mattr_accessor :proxy_class
-    self.proxy_class = ActiveSupport::Multibyte::Chars
+    def self.proxy_class=(klass) # replaces the default that proxy_class would otherwise return
+      @proxy_class = klass
+    end
+
+    # Returns the current proxy class, defaulting to ActiveSupport::Multibyte::Chars when none was set
+    def self.proxy_class
+      @proxy_class ||= ActiveSupport::Multibyte::Chars # constant is only resolved when the method is called
+    end
+
+    # Regular expressions that each match exactly one validly encoded character, keyed by encoding name
+    VALID_CHARACTER = {
+      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+      'UTF-8' => /\A(?:
+                  [\x00-\x7f]                                         |
+                  [\xc2-\xdf] [\x80-\xbf]                             |
+                  \xe0        [\xa0-\xbf] [\x80-\xbf]                 |
+                  [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]                 |
+                  \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf]     |
+                  [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf]     |
+                  \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+      # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+      'Shift_JIS' => /\A(?:
+                  [\x00-\x7e \xa1-\xdf]                                     |
+                  [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+    }
   end
 end
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+require 'active_support/multibyte/utils'
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 64a35dca40..579ccc124d 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -74,16 +74,7 @@ module ActiveSupport #:nodoc:
       UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
       UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
 
-      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
-      UTF8_PAT = /\A(?:
-                     [\x00-\x7f]                                     |
-                     [\xc2-\xdf] [\x80-\xbf]                         |
-                     \xe0        [\xa0-\xbf] [\x80-\xbf]             |
-                     [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
-                     \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                     [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                     \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
-                    )*\z/xn
+      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] # NOTE(review): matches exactly one character, whereas the removed literal ended in )*\z and matched whole strings -- confirm no remaining UTF8_PAT user relies on the old meaning
 
       attr_reader :wrapped_string
       alias to_s wrapped_string
@@ -308,23 +299,23 @@ module ActiveSupport #:nodoc:
       def rstrip
         chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
       end
-      
+
       # Strips entire range of Unicode whitespace from the left of the string.
       def lstrip
         chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
       end
-      
+
       # Strips entire range of Unicode whitespace from the right and left of the string.
       def strip
         rstrip.lstrip
       end
-      
+
       # Returns the number of codepoints in the string
       def size
         self.class.u_unpack(@wrapped_string).size
       end
       alias_method :length, :size
-      
+
       # Reverses all characters in the string.
       #
       # Example:
@@ -332,7 +323,7 @@ module ActiveSupport #:nodoc:
       def reverse
         chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
       end
-      
+
       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
       # character.
       #
@@ -647,7 +638,7 @@ module ActiveSupport #:nodoc:
           string.split(//u).map do |c|
             c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
 
-            if !UTF8_PAT.match(c)
+            if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
               n = c.unpack('C')[0]
               n < 128 ? n.chr :
               n < 160 ? [UCD.cp1252[n] || n].pack('U') :
diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb
new file mode 100644
index 0000000000..acef84da91
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,61 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    if Kernel.const_defined?(:Encoding)
+      # Returns a regular expression that matches valid characters in the current encoding
+      def self.valid_character
+        VALID_CHARACTER[Encoding.default_internal.to_s] # nil when no pattern is registered under that name
+      end
+    else
+      def self.valid_character # no Encoding constant available: choose the pattern from $KCODE instead
+        case $KCODE
+        when 'UTF8'
+          VALID_CHARACTER['UTF-8']
+        when 'SJIS'
+          VALID_CHARACTER['Shift_JIS']
+        end # any other $KCODE value matches no branch, so the method returns nil
+      end
+    end
+
+    if 'string'.respond_to?(:valid_encoding?)
+      # Verifies the encoding of a string; returns true when it is valid and false otherwise
+      def self.verify(string)
+        string.valid_encoding?
+      end
+    else
+      # Fallback for Strings without valid_encoding?: checks the string one character at a time
+      def self.verify(string)
+        # The pattern is looked up once here instead of once per character.
+        expression = valid_character
+        # Without a pattern for the current encoding there is nothing to check against.
+        return true unless expression
+        string.split(//).all? { |c| expression.match(c) }
+      end
+    end
+
+    # Verifies the encoding of the string and raises an EncodingError when it's not valid; returns nil otherwise
+    def self.verify!(string)
+      raise EncodingError.new("Found characters with invalid encoding") unless verify(string)
+    end
+
+    if 'string'.respond_to?(:force_encoding)
+      # Removes all invalid characters from the string.
+      #
+      # Note: this method is a no-op in Ruby 1.9
+      def self.clean(string)
+        string
+      end
+    else
+      def self.clean(string)
+        # Fallback for Strings without force_encoding: filters the string character by character.
+        # The pattern is looked up once here instead of once per character.
+        expression = valid_character
+        # Without a pattern for the current encoding the string is returned untouched.
+        return string unless expression
+        valid = string.split(//).select { |c| expression.match(c) }
+        valid.join
+      end
+    end
+  end
+end
+\ No newline at end of file
diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb
new file mode 100644
index 0000000000..d8ac5ff139
--- /dev/null
+++ b/activesupport/test/multibyte_utils_test.rb
@@ -0,0 +1,141 @@
+# encoding: utf-8
+
+require 'abstract_unit'
+require 'multibyte_test_helpers'
+
+class MultibyteUtilsTest < ActiveSupport::TestCase
+  include MultibyteTestHelpers
+
+  test "valid_character returns an expression for the current encoding" do
+    with_encoding('None') do # no pattern is expected for this setting
+      assert_nil ActiveSupport::Multibyte.valid_character
+    end
+    with_encoding('UTF8') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
+    end
+    with_encoding('SJIS') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
+    end
+  end
+
+  test "verify verifies ASCII strings are properly encoded" do
+    with_encoding('None') do
+      examples.each do |example| # every fixture, the 'invalid ...' ones included, is expected to pass here
+        assert ActiveSupport::Multibyte.verify(example)
+      end
+    end
+  end
+
+  test "verify verifies UTF-8 strings are properly encoded" do
+    with_encoding('UTF8') do
+      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
+    end
+  end
+
+  test "verify verifies Shift-JIS strings are properly encoded" do
+    with_encoding('SJIS') do
+      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
+    end
+  end
+
+  test "verify! raises an exception when it finds an invalid character" do
+    with_encoding('UTF8') do
+      assert_raises(ActiveSupport::Multibyte::EncodingError) do # the error class is looked up in the Multibyte namespace
+        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
+      end
+    end
+  end
+
+  test "verify! doesn't raise an exception when the encoding is valid" do
+    with_encoding('UTF8') do
+      assert_nothing_raised do # verify! returns nothing useful, so only the absence of an error is checked
+        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
+      end
+    end
+  end
+
+  if RUBY_VERSION < '1.9' # mirrors the two clean variants: filtering before 1.9, pass-through afterwards
+    test "clean leaves ASCII strings intact" do
+      with_encoding('None') do
+        [
+          'word', "\270\236\010\210\245" # the octal escapes are the same bytes as the 'invalid UTF-8' fixture
+        ].each do |string|
+          assert_equal string, ActiveSupport::Multibyte.clean(string)
+        end
+      end
+    end
+
+    test "clean cleans invalid characters from UTF-8 encoded strings" do
+      with_encoding('UTF8') do
+        cleaned_utf8 = [8].pack('C*') # byte 8 is the only part of the 'invalid UTF-8' fixture expected to survive
+        assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
+        assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
+      end
+    end
+
+    test "clean cleans invalid characters from Shift-JIS encoded strings" do
+      with_encoding('SJIS') do
+        cleaned_sjis = [184, 0, 136, 165].pack('C*') # the fixture minus its 158, 8 and 255 bytes
+        assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
+        assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+      end
+    end
+  else
+    test "clean is a no-op" do
+      with_encoding('UTF8') do
+        assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+      end
+    end
+  end
+
+  private
+
+  STRINGS = { # raw byte fixtures, fetched by name through example and all at once through examples
+    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'),
+    'invalid ASCII'     => [128].pack('C*'),
+    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
+    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'),
+    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'),
+    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
+  }
+
+  if Kernel.const_defined?(:Encoding)
+    def example(key)
+      STRINGS[key].force_encoding(Encoding.default_internal) # retags the shared fixture in place; its bytes stay the same
+    end
+
+    def examples
+      STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) } # same retagging, applied to every fixture
+    end
+  else
+    def example(key) # no Encoding constant: fixtures are handed out exactly as stored
+      STRINGS[key]
+    end
+
+    def examples
+      STRINGS.values
+    end
+  end
+
+  if 'string'.respond_to?(:encoding)
+    def with_encoding(enc) # runs the given block with Encoding.default_internal switched according to enc
+      before = Encoding.default_internal
+      # 'UTF8' and 'SJIS' select the matching Encoding; any other value selects BINARY.
+      case enc
+      when 'UTF8'
+        Encoding.default_internal = Encoding::UTF_8
+      when 'SJIS'
+        Encoding.default_internal = Encoding::Shift_JIS
+      else
+        Encoding.default_internal = Encoding::BINARY
+      end
+      yield
+    ensure # put the previous default back even when the block raises or an assertion fails
+      Encoding.default_internal = before
+    end
+  else
+    alias with_encoding with_kcode
+  end
+end
+\ No newline at end of file