diff options
author | Pratik Naik <pratiknaik@gmail.com> | 2009-09-21 21:14:04 +0100 |
---|---|---|
committer | Pratik Naik <pratiknaik@gmail.com> | 2009-09-21 21:14:04 +0100 |
commit | 340be9bddd8e5902e0218a0101a40a17a4afd558 (patch) | |
tree | ef4de25f3f8eb610dc2235f0762b01cb1d464efd /activesupport/lib/active_support/multibyte | |
parent | b31cdb55422226cd45a2234a4b54986f1f611151 (diff) | |
parent | 1bbb9b2db05730194edfd7d2cef9f5fcb9d79e50 (diff) | |
download | rails-340be9bddd8e5902e0218a0101a40a17a4afd558.tar.gz rails-340be9bddd8e5902e0218a0101a40a17a4afd558.tar.bz2 rails-340be9bddd8e5902e0218a0101a40a17a4afd558.zip |
Merge commit 'mainstream/master'
Diffstat (limited to 'activesupport/lib/active_support/multibyte')
-rw-r--r-- | activesupport/lib/active_support/multibyte/chars.rb | 23 | ||||
-rw-r--r-- | activesupport/lib/active_support/multibyte/utils.rb | 61 |
2 files changed, 68 insertions, 16 deletions
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb index 64a35dca40..579ccc124d 100644 --- a/activesupport/lib/active_support/multibyte/chars.rb +++ b/activesupport/lib/active_support/multibyte/chars.rb @@ -74,16 +74,7 @@ module ActiveSupport #:nodoc: UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ - # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) - UTF8_PAT = /\A(?: - [\x00-\x7f] | - [\xc2-\xdf] [\x80-\xbf] | - \xe0 [\xa0-\xbf] [\x80-\xbf] | - [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | - \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | - [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | - \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] - )*\z/xn + UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] attr_reader :wrapped_string alias to_s wrapped_string @@ -308,23 +299,23 @@ module ActiveSupport #:nodoc: def rstrip chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) end - + # Strips entire range of Unicode whitespace from the left of the string. def lstrip chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) end - + # Strips entire range of Unicode whitespace from the right and left of the string. def strip rstrip.lstrip end - + # Returns the number of codepoints in the string def size self.class.u_unpack(@wrapped_string).size end alias_method :length, :size - + # Reverses all characters in the string. # # Example: @@ -332,7 +323,7 @@ module ActiveSupport #:nodoc: def reverse chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*')) end - + # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that # character. # @@ -647,7 +638,7 @@ module ActiveSupport #:nodoc: string.split(//u).map do |c| c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) - if !UTF8_PAT.match(c) + if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) n = c.unpack('C')[0] n < 128 ? n.chr : n < 160 ? [UCD.cp1252[n] || n].pack('U') : diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb new file mode 100644 index 0000000000..8e47763d39 --- /dev/null +++ b/activesupport/lib/active_support/multibyte/utils.rb @@ -0,0 +1,61 @@ +# encoding: utf-8 + +module ActiveSupport #:nodoc: + module Multibyte #:nodoc: + if Kernel.const_defined?(:Encoding) + # Returns a regular expression that matches valid characters in the current encoding + def self.valid_character + VALID_CHARACTER[Encoding.default_external.to_s] + end + else + def self.valid_character + case $KCODE + when 'UTF8' + VALID_CHARACTER['UTF-8'] + when 'SJIS' + VALID_CHARACTER['Shift_JIS'] + end + end + end + + if 'string'.respond_to?(:valid_encoding?) + # Verifies the encoding of a string + def self.verify(string) + string.valid_encoding? + end + else + def self.verify(string) + if expression = valid_character + for c in string.split(//) + return false unless expression.match(c) + end + end + true + end + end + + # Verifies the encoding of the string and raises an exception when it's not valid + def self.verify!(string) + raise EncodingError.new("Found characters with invalid encoding") unless verify(string) + end + + if 'string'.respond_to?(:force_encoding) + # Removes all invalid characters from the string. + # + # Note: this method is a no-op in Ruby 1.9 + def self.clean(string) + string + end + else + def self.clean(string) + if expression = valid_character + stripped = []; for c in string.split(//) + stripped << c if expression.match(c) + end; stripped.join + else + string + end + end + end + end +end |