aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/lib
diff options
context:
space:
mode:
authorMichael Koziarski <michael@koziarski.com>2009-08-31 12:16:22 -0700
committerMichael Koziarski <michael@koziarski.com>2009-09-04 09:25:38 +1200
commit9a73630d935e360f3dc896e50dd673afb97cf3b5 (patch)
treee404f7dbfc142a10b45b758f0cea23812abc9f23 /activesupport/lib
parent5e6dab8b34152bc48c89032d20e5bda1511e28fb (diff)
downloadrails-9a73630d935e360f3dc896e50dd673afb97cf3b5.tar.gz
rails-9a73630d935e360f3dc896e50dd673afb97cf3b5.tar.bz2
rails-9a73630d935e360f3dc896e50dd673afb97cf3b5.zip
Add verify and clean methods to ActiveSupport::Multibyte.
When accepting character input from outside of your application you can't blindly trust that all strings are properly encoded. With these methods you can check incoming strings and clean them up if necessary. Signed-off-by: Michael Koziarski <michael@koziarski.com> Conflicts: activesupport/lib/active_support/multibyte.rb
Diffstat (limited to 'activesupport/lib')
-rw-r--r--activesupport/lib/active_support/multibyte.rb32
-rw-r--r--activesupport/lib/active_support/multibyte/chars.rb23
-rw-r--r--activesupport/lib/active_support/multibyte/utils.rb61
3 files changed, 98 insertions, 18 deletions
diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
index d8d58f3bce..f59285daba 100644
--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
@@ -29,7 +29,35 @@ module ActiveSupport #:nodoc:
#
# Example:
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
- mattr_accessor :proxy_class
- self.proxy_class = ActiveSupport::Multibyte::Chars
+ def self.proxy_class=(klass)
+ @proxy_class = klass
+ end
+
+ # Returns the currect proxy class
+ def self.proxy_class
+ @proxy_class ||= ActiveSupport::Multibyte::Chars
+ end
+
+ # Regular expressions that describe valid byte sequences for a character
+ VALID_CHARACTER = {
+ # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+ 'UTF-8' => /\A(?:
+ [\x00-\x7f] |
+ [\xc2-\xdf] [\x80-\xbf] |
+ \xe0 [\xa0-\xbf] [\x80-\xbf] |
+ [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
+ \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+ [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+ \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+ # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+ 'Shift_JIS' => /\A(?:
+ [\x00-\x7e \xa1-\xdf] |
+ [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+ }
end
end
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+require 'active_support/multibyte/utils'
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 64a35dca40..579ccc124d 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -74,16 +74,7 @@ module ActiveSupport #:nodoc:
UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
- # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
- UTF8_PAT = /\A(?:
- [\x00-\x7f] |
- [\xc2-\xdf] [\x80-\xbf] |
- \xe0 [\xa0-\xbf] [\x80-\xbf] |
- [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
- \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
- [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
- \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
- )*\z/xn
+ UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
attr_reader :wrapped_string
alias to_s wrapped_string
@@ -308,23 +299,23 @@ module ActiveSupport #:nodoc:
def rstrip
chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
end
-
+
# Strips entire range of Unicode whitespace from the left of the string.
def lstrip
chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
end
-
+
# Strips entire range of Unicode whitespace from the right and left of the string.
def strip
rstrip.lstrip
end
-
+
# Returns the number of codepoints in the string
def size
self.class.u_unpack(@wrapped_string).size
end
alias_method :length, :size
-
+
# Reverses all characters in the string.
#
# Example:
@@ -332,7 +323,7 @@ module ActiveSupport #:nodoc:
def reverse
chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
end
-
+
# Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
# character.
#
@@ -647,7 +638,7 @@ module ActiveSupport #:nodoc:
string.split(//u).map do |c|
c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
- if !UTF8_PAT.match(c)
+ if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
n = c.unpack('C')[0]
n < 128 ? n.chr :
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb
new file mode 100644
index 0000000000..acef84da91
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,61 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+ module Multibyte #:nodoc:
+ if Kernel.const_defined?(:Encoding)
+ # Returns a regular expression that matches valid characters in the current encoding
+ def self.valid_character
+ VALID_CHARACTER[Encoding.default_internal.to_s]
+ end
+ else
+ def self.valid_character
+ case $KCODE
+ when 'UTF8'
+ VALID_CHARACTER['UTF-8']
+ when 'SJIS'
+ VALID_CHARACTER['Shift_JIS']
+ end
+ end
+ end
+
+ if 'string'.respond_to?(:valid_encoding?)
+ # Verifies the encoding of a string
+ def self.verify(string)
+ string.valid_encoding?
+ end
+ else
+ def self.verify(string)
+ if expression = valid_character
+ for c in string.split(//)
+ return false unless valid_character.match(c)
+ end
+ end
+ true
+ end
+ end
+
+ # Verifies the encoding of the string and raises an exception when it's not valid
+ def self.verify!(string)
+ raise EncodingError.new("Found characters with invalid encoding") unless verify(string)
+ end
+
+ if 'string'.respond_to?(:force_encoding)
+ # Removes all invalid characters from the string.
+ #
+ # Note: this method is a no-op in Ruby 1.9
+ def self.clean(string)
+ string
+ end
+ else
+ def self.clean(string)
+ if expression = valid_character
+ stripped = []; for c in string.split(//)
+ stripped << c if valid_character.match(c)
+ end; stripped.join
+ else
+ string
+ end
+ end
+ end
+ end
+end \ No newline at end of file