aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--activesupport/lib/active_support/multibyte.rb32
-rw-r--r--activesupport/lib/active_support/multibyte/chars.rb23
-rw-r--r--activesupport/lib/active_support/multibyte/utils.rb61
-rw-r--r--activesupport/test/multibyte_utils_test.rb141
4 files changed, 239 insertions, 18 deletions
diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
index d8d58f3bce..f59285daba 100644
--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
@@ -29,7 +29,35 @@ module ActiveSupport #:nodoc:
#
# Example:
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
- mattr_accessor :proxy_class
- self.proxy_class = ActiveSupport::Multibyte::Chars
+ def self.proxy_class=(klass)
+ @proxy_class = klass
+ end
+
+ # Returns the currect proxy class
+ def self.proxy_class
+ @proxy_class ||= ActiveSupport::Multibyte::Chars
+ end
+
+ # Regular expressions that describe valid byte sequences for a character
+ VALID_CHARACTER = {
+ # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+ 'UTF-8' => /\A(?:
+ [\x00-\x7f] |
+ [\xc2-\xdf] [\x80-\xbf] |
+ \xe0 [\xa0-\xbf] [\x80-\xbf] |
+ [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
+ \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+ [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+ \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+ # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+ 'Shift_JIS' => /\A(?:
+ [\x00-\x7e \xa1-\xdf] |
+ [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+ }
end
end
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+require 'active_support/multibyte/utils'
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 64a35dca40..579ccc124d 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -74,16 +74,7 @@ module ActiveSupport #:nodoc:
UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
- # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
- UTF8_PAT = /\A(?:
- [\x00-\x7f] |
- [\xc2-\xdf] [\x80-\xbf] |
- \xe0 [\xa0-\xbf] [\x80-\xbf] |
- [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
- \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
- [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
- \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
- )*\z/xn
+ UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
attr_reader :wrapped_string
alias to_s wrapped_string
@@ -308,23 +299,23 @@ module ActiveSupport #:nodoc:
def rstrip
chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
end
-
+
# Strips entire range of Unicode whitespace from the left of the string.
def lstrip
chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
end
-
+
# Strips entire range of Unicode whitespace from the right and left of the string.
def strip
rstrip.lstrip
end
-
+
# Returns the number of codepoints in the string
def size
self.class.u_unpack(@wrapped_string).size
end
alias_method :length, :size
-
+
# Reverses all characters in the string.
#
# Example:
@@ -332,7 +323,7 @@ module ActiveSupport #:nodoc:
def reverse
chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
end
-
+
# Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
# character.
#
@@ -647,7 +638,7 @@ module ActiveSupport #:nodoc:
string.split(//u).map do |c|
c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
- if !UTF8_PAT.match(c)
+ if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
n = c.unpack('C')[0]
n < 128 ? n.chr :
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb
new file mode 100644
index 0000000000..acef84da91
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,61 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+ module Multibyte #:nodoc:
+ if Kernel.const_defined?(:Encoding)
+ # Returns a regular expression that matches valid characters in the current encoding
+ def self.valid_character
+ VALID_CHARACTER[Encoding.default_internal.to_s]
+ end
+ else
+ def self.valid_character
+ case $KCODE
+ when 'UTF8'
+ VALID_CHARACTER['UTF-8']
+ when 'SJIS'
+ VALID_CHARACTER['Shift_JIS']
+ end
+ end
+ end
+
+ if 'string'.respond_to?(:valid_encoding?)
+ # Verifies the encoding of a string
+ def self.verify(string)
+ string.valid_encoding?
+ end
+ else
+ def self.verify(string)
+ if expression = valid_character
+ for c in string.split(//)
+ return false unless valid_character.match(c)
+ end
+ end
+ true
+ end
+ end
+
+ # Verifies the encoding of the string and raises an exception when it's not valid
+ def self.verify!(string)
+ raise EncodingError.new("Found characters with invalid encoding") unless verify(string)
+ end
+
+ if 'string'.respond_to?(:force_encoding)
+ # Removes all invalid characters from the string.
+ #
+ # Note: this method is a no-op in Ruby 1.9
+ def self.clean(string)
+ string
+ end
+ else
+ def self.clean(string)
+ if expression = valid_character
+ stripped = []; for c in string.split(//)
+ stripped << c if valid_character.match(c)
+ end; stripped.join
+ else
+ string
+ end
+ end
+ end
+ end
+end \ No newline at end of file
diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb
new file mode 100644
index 0000000000..d8ac5ff139
--- /dev/null
+++ b/activesupport/test/multibyte_utils_test.rb
@@ -0,0 +1,141 @@
+# encoding: utf-8
+
+require 'abstract_unit'
+require 'multibyte_test_helpers'
+
+class MultibyteUtilsTest < ActiveSupport::TestCase
+ include MultibyteTestHelpers
+
+ test "valid_character returns an expression for the current encoding" do
+ with_encoding('None') do
+ assert_nil ActiveSupport::Multibyte.valid_character
+ end
+ with_encoding('UTF8') do
+ assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
+ end
+ with_encoding('SJIS') do
+ assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
+ end
+ end
+
+ test "verify verifies ASCII strings are properly encoded" do
+ with_encoding('None') do
+ examples.each do |example|
+ assert ActiveSupport::Multibyte.verify(example)
+ end
+ end
+ end
+
+ test "verify verifies UTF-8 strings are properly encoded" do
+ with_encoding('UTF8') do
+ assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
+ assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
+ end
+ end
+
+ test "verify verifies Shift-JIS strings are properly encoded" do
+ with_encoding('SJIS') do
+ assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
+ assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
+ end
+ end
+
+ test "verify! raises an exception when it finds an invalid character" do
+ with_encoding('UTF8') do
+ assert_raises(ActiveSupport::Multibyte::EncodingError) do
+ ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
+ end
+ end
+ end
+
+ test "verify! doesn't raise an exception when the encoding is valid" do
+ with_encoding('UTF8') do
+ assert_nothing_raised do
+ ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
+ end
+ end
+ end
+
+ if RUBY_VERSION < '1.9'
+ test "clean leaves ASCII strings intact" do
+ with_encoding('None') do
+ [
+ 'word', "\270\236\010\210\245"
+ ].each do |string|
+ assert_equal string, ActiveSupport::Multibyte.clean(string)
+ end
+ end
+ end
+
+ test "clean cleans invalid characters from UTF-8 encoded strings" do
+ with_encoding('UTF8') do
+ cleaned_utf8 = [8].pack('C*')
+ assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
+ assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
+ end
+ end
+
+ test "clean cleans invalid characters from Shift-JIS encoded strings" do
+ with_encoding('SJIS') do
+ cleaned_sjis = [184, 0, 136, 165].pack('C*')
+ assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
+ assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+ end
+ end
+ else
+ test "clean is a no-op" do
+ with_encoding('UTF8') do
+ assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+ end
+ end
+ end
+
+ private
+
+ STRINGS = {
+ 'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'),
+ 'invalid ASCII' => [128].pack('C*'),
+ 'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
+ 'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'),
+ 'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'),
+ 'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
+ }
+
+ if Kernel.const_defined?(:Encoding)
+ def example(key)
+ STRINGS[key].force_encoding(Encoding.default_internal)
+ end
+
+ def examples
+ STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) }
+ end
+ else
+ def example(key)
+ STRINGS[key]
+ end
+
+ def examples
+ STRINGS.values
+ end
+ end
+
+ if 'string'.respond_to?(:encoding)
+ def with_encoding(enc)
+ before = Encoding.default_internal
+
+ case enc
+ when 'UTF8'
+ Encoding.default_internal = Encoding::UTF_8
+ when 'SJIS'
+ Encoding.default_internal = Encoding::Shift_JIS
+ else
+ Encoding.default_internal = Encoding::BINARY
+ end
+ yield
+
+ Encoding.default_internal = before
+ end
+ else
+ alias with_encoding with_kcode
+ end
+end \ No newline at end of file