Simplify ActiveSupport::Multibyte and make it run on Ruby 1.9.

* Unicode methods are now defined directly on Chars instead of a handler
* Updated Unicode database to Unicode 5.1.0
* Improved documentation
author: Manfred Stienstra <manfred@fngtps.com> 2008-09-21 17:21:30 +0200
committer: Manfred Stienstra <manfred@fngtps.com> 2008-09-21 17:21:30 +0200
commit: 22f75d539dca7b6f33cbf86e4e9d1944bb22731f (patch)
tree: f3c775cda7f82f5b527864adc363deb3c5eee354 /activesupport/lib/active_support/multibyte
parent: 5f83e1844c83c19cf97c6415b943c6ec3cb4bb06 (diff)
download: rails-22f75d539dca7b6f33cbf86e4e9d1944bb22731f.tar.gz
rails-22f75d539dca7b6f33cbf86e4e9d1944bb22731f.tar.bz2
rails-22f75d539dca7b6f33cbf86e4e9d1944bb22731f.zip
7 files changed, 733 insertions, 892 deletions
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index de2c83f8d1..c05419bfbf 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -1,142 +1,670 @@
-require 'active_support/multibyte/handlers/utf8_handler'
-require 'active_support/multibyte/handlers/passthru_handler'
-
-# Encapsulates all the functionality related to the Chars proxy.
-module ActiveSupport::Multibyte #:nodoc:
-  # Chars enables you to work transparently with multibyte encodings in the Ruby String class without having extensive
-  # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
-  # encoding safe manner. All the normal String methods are also implemented on the proxy.
-  #
-  # String methods are proxied through the Chars object, and can be accessed through the +chars+ method. Methods
-  # which would normally return a String object now return a Chars object so methods can be chained.
-  #
-  #   "The Perfect String  ".chars.downcase.strip.normalize # => "the perfect string"
-  #
-  # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
-  # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
-  #
-  #   bad.explicit_checking_method "T".chars.downcase.to_s
-  #
-  # The actual operations on the string are delegated to handlers. Theoretically handlers can be implemented for
-  # any encoding, but the default handler handles UTF-8. This handler is set during initialization, if you want to
-  # use you own handler, you can set it on the Chars class. Look at the UTF8Handler source for an example how to
-  # implement your own handler. If you your own handler to work on anything but UTF-8 you probably also
-  # want to override Chars#handler.
-  #
-  #   ActiveSupport::Multibyte::Chars.handler = MyHandler
-  #
-  # Note that a few methods are defined on Chars instead of the handler because they are defined on Object or Kernel
-  # and method_missing can't catch them.
-  class Chars
-    
-    attr_reader :string # The contained string
-    alias_method :to_s, :string
-    
-    include Comparable
-    
-    # The magic method to make String and Chars comparable
-    def to_str
-      # Using any other ways of overriding the String itself will lead you all the way from infinite loops to
-      # core dumps. Don't go there.
-      @string
-    end
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    # Chars enables you to work transparently with multibyte encodings in the Ruby String class without having extensive
+    # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
+    # encoding safe manner. All the normal String methods are also implemented on the proxy.
+    #
+    # String methods are proxied through the Chars object, and can be accessed through the +mb_chars+ method. Methods
+    # which would normally return a String object now return a Chars object so methods can be chained.
+    #
+    #   "The Perfect String  ".mb_chars.downcase.strip.normalize #=> "the perfect string"
+    #
+    # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
+    # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
+    #
+    #   bad.explicit_checking_method "T".mb_chars.downcase.to_s
+    #
+    # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
+    # encodings you can write your own multibyte string handler and configure it through 
+    # ActiveSupport::Multibyte.proxy_class.
+    #
+    #   class CharsForUTF32
+    #     def size
+    #       @wrapped_string.size / 4
+    #     end
+    #
+    #     def self.accepts?(string)
+    #       string.length % 4 == 0
+    #     end
+    #   end
+    #
+    #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32
+    class Chars
+      # Hangul character boundaries and properties
+      HANGUL_SBASE = 0xAC00
+      HANGUL_LBASE = 0x1100
+      HANGUL_VBASE = 0x1161
+      HANGUL_TBASE = 0x11A7
+      HANGUL_LCOUNT = 19
+      HANGUL_VCOUNT = 21
+      HANGUL_TCOUNT = 28
+      HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
+      HANGUL_SCOUNT = 11172
+      HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
+      HANGUL_JAMO_FIRST = 0x1100
+      HANGUL_JAMO_LAST = 0x11FF
+
+      # All the unicode whitespace
+      UNICODE_WHITESPACE = [
+        (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
+        0x0020,                # White_Space # Zs       SPACE
+        0x0085,                # White_Space # Cc       <control-0085>
+        0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+        0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+        0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+        (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+        0x2028,                # White_Space # Zl       LINE SEPARATOR
+        0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+        0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+        0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+        0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+      ].flatten.freeze
+
+      # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
+      # between little and big endian. This is not an issue in utf-8, so it must be ignored.
+      UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
+
+      # Returns a regular expression pattern that matches the passed Unicode codepoints
+      def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
+        array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
+      end
+      UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
+      UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
+
+      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+      UTF8_PAT = /\A(?:
+                     [\x00-\x7f]                                     |
+                     [\xc2-\xdf] [\x80-\xbf]                         |
+                     \xe0        [\xa0-\xbf] [\x80-\xbf]             |
+                     [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
+                     \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+                     [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+                     \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
+                    )*\z/xn
+
+      attr_reader :wrapped_string
+      alias to_s wrapped_string
+      alias to_str wrapped_string
+
+      # Creates a new Chars instance. +string+ is the wrapped string.
+      if '1.9'.respond_to?(:force_encoding)
+        def initialize(string)
+          @wrapped_string = string
+          @wrapped_string.force_encoding(Encoding::UTF_8) unless @wrapped_string.frozen?
+        end
+      else
+        def initialize(string)
+          @wrapped_string = string
+        end
+      end
+
+      # Forward all undefined methods to the wrapped string.
+      def method_missing(method, *args, &block)
+        if method.to_s =~ /!$/
+          @wrapped_string.__send__(method, *args, &block)
+          self
+        else
+          chars(@wrapped_string.__send__(method, *args, &block))
+        end
+      end
+      
+      # Returns +true+ if _obj_ responds to the given method. Private methods are included in the search
+      # only if the optional second parameter evaluates to +true+.
+      def respond_to?(method, include_private=false)
+        super || @wrapped_string.respond_to?(method, include_private) || false
+      end
+
+      # Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
+      def acts_like_string?
+        true
+      end
+
+      # Returns +true+ if the Chars class can and should act as a proxy for the string +string+. Returns
+      # +false+ otherwise.
+      def self.wants?(string)
+        RUBY_VERSION < '1.9' && $KCODE == 'UTF8' && consumes?(string)
+      end
 
-    # Make duck-typing with String possible
-    def respond_to?(method, include_priv = false)
-      super || @string.respond_to?(method, include_priv) ||
-        handler.respond_to?(method, include_priv) ||
-        (method.to_s =~ /(.*)!/ && handler.respond_to?($1, include_priv)) ||
+      # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
+      def self.consumes?(string)
+        # Unpack is a little bit faster than regular expressions.
+        string.unpack('U*')
+        true
+      rescue ArgumentError
         false
-    end
+      end
 
-    # Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
-    def acts_like_string?
-      true
-    end
+      include Comparable
 
-    # Create a new Chars instance.
-    def initialize(str)
-      @string = str.respond_to?(:string) ? str.string : str
-    end
-    
-    # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, equal or after the
-    # object on the right side of the operation. It accepts any object that implements +to_s+. See String.<=>
-    # for more details.
-    def <=>(other); @string <=> other.to_s; end
-    
-    # Works just like String#split, with the exception that the items in the resulting list are Chars
-    # instances instead of String. This makes chaining methods easier.
-    def split(*args)
-      @string.split(*args).map { |i| i.chars }
-    end
-    
-    # Gsub works exactly the same as gsub on a normal string.
-    def gsub(*a, &b); @string.gsub(*a, &b).chars; end
-    
-    # Like String.=~ only it returns the character offset (in codepoints) instead of the byte offset.
-    def =~(other)
-      handler.translate_offset(@string, @string =~ other)
-    end
-    
-    # Try to forward all undefined methods to the handler, when a method is not defined on the handler, send it to
-    # the contained string. Method_missing is also responsible for making the bang! methods destructive.
-    def method_missing(m, *a, &b)
-      begin
-        # Simulate methods with a ! at the end because we can't touch the enclosed string from the handlers.
-        if m.to_s =~ /^(.*)\!$/ && handler.respond_to?($1)
-          result = handler.send($1, @string, *a, &b)
-          if result == @string
-            result = nil
+      # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, equal or after the
+      # object on the right side of the operation. It accepts any object that implements +to_s+. See String.<=>
+      # for more details.
+      #
+      # Example:
+      #   'é'.mb_chars <=> 'ü'.mb_chars #=> -1
+      def <=>(other)
+        @wrapped_string <=> other.to_s
+      end
+
+      # Returns a new Chars object containing the other object concatenated to the string.
+      #
+      # Example:
+      #   ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
+      def +(other)
+        self << other
+      end
+
+      # Like String.=~ only it returns the character offset (in codepoints) instead of the byte offset.
+      #
+      # Example:
+      #   'Café périferôl'.mb_chars =~ /ô/ #=> 12
+      def =~(other)
+        translate_offset(@wrapped_string =~ other)
+      end
+
+      # Works just like String#split, with the exception that the items in the resulting list are Chars
+      # instances instead of String. This makes chaining methods easier.
+      #
+      # Example:
+      #   'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
+      def split(*args)
+        @wrapped_string.split(*args).map { |i| i.mb_chars }
+      end
+
+      # Inserts the passed string at specified codepoint offsets
+      #
+      # Example:
+      #   'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
+      def insert(offset, fragment)
+        unpacked = self.class.u_unpack(@wrapped_string)
+        unless offset > unpacked.length
+          @wrapped_string.replace(
+            self.class.u_unpack(@wrapped_string).insert(offset, *self.class.u_unpack(fragment)).pack('U*')
+          )
+        else
+          raise IndexError, "index #{offset} out of string"
+        end
+        self
+      end
+
+      # Returns true if contained string contains +other+. Returns false otherwise.
+      #
+      # Example:
+      #   'Café'.mb_chars.include?('é') #=> true
+      def include?(other)
+        # We have to redefine this method because Enumerable defines it.
+        @wrapped_string.include?(other)
+      end
+
+      # Returns the position of the passed argument in the string, counting in codepoints
+      #
+      # Example:
+      #   'Café périferôl'.mb_chars.index('ô') #=> 12
+      def index(*args)
+        index = @wrapped_string.index(*args)
+        index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil
+      end
+
+      # Works just like the indexed replace method on string, except instead of byte offsets you specify
+      # character offsets.
+      #
+      # Example:
+      #
+      #   s = "Müller"
+      #   s.mb_chars[2] = "e" # Replace character with offset 2
+      #   s
+      #   #=> "Müeler"
+      #
+      #   s = "Müller"
+      #   s.mb_chars[1, 2] = "ö" # Replace 2 characters at character offset 1
+      #   s
+      #   #=> "Möler"
+      def []=(*args)
+        replace_by = args.pop
+        # Indexed replace with regular expressions already works
+        if args.first.is_a?(Regexp)
+          @wrapped_string[*args] = replace_by
+        else
+          result = self.class.u_unpack(@wrapped_string)
+          if args[0].is_a?(Fixnum)
+            raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
+            min = args[0]
+            max = args[1].nil? ? min : (min + args[1] - 1)
+            range = Range.new(min, max)
+            replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum)
+          elsif args.first.is_a?(Range)
+            raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length
+            range = args[0]
           else
-            @string.replace result
+            needle = args[0].to_s
+            min = index(needle)
+            max = min + self.class.u_unpack(needle).length - 1
+            range = Range.new(min, max)
           end
-        elsif handler.respond_to?(m)
-          result = handler.send(m, @string, *a, &b)
-        else
-          result = @string.send(m, *a, &b)
+          result[range] = self.class.u_unpack(replace_by)
+          @wrapped_string.replace(result.pack('U*'))
         end
-      rescue Handlers::EncodingError
-        @string.replace handler.tidy_bytes(@string)
-        retry
+        self
       end
-      
-      if result.kind_of?(String)
-        result.chars
-      else
-        result
+
+      # Works just like String#rjust, only integer specifies characters instead of bytes.
+      #
+      # Example:
+      #
+      #   "¾ cup".mb_chars.rjust(8).to_s
+      #   #=> "   ¾ cup"
+      #
+      #   "¾ cup".mb_chars.rjust(8, " ").to_s # Use non-breaking whitespace
+      #   #=> "   ¾ cup"
+      def rjust(integer, padstr=' ')
+        justify(integer, :right, padstr)
       end
-    end
-    
-    # Set the handler class for the Char objects.
-    def self.handler=(klass)
-      @@handler = klass
-    end
 
-    # Returns the proper handler for the contained string depending on $KCODE and the encoding of the string. This
-    # method is used internally to always redirect messages to the proper classes depending on the context.
-    def handler
-      if utf8_pragma?
-        @@handler
-      else
-        ActiveSupport::Multibyte::Handlers::PassthruHandler
+      # Works just like String#ljust, only integer specifies characters instead of bytes.
+      #
+      # Example:
+      #
+      #   "¾ cup".mb_chars.ljust(8).to_s
+      #   #=> "¾ cup   "
+      #
+      #   "¾ cup".mb_chars.ljust(8, " ").to_s # Use non-breaking whitespace
+      #   #=> "¾ cup   "
+      def ljust(integer, padstr=' ')
+        justify(integer, :left, padstr)
+      end
+
+      # Works just like String#center, only integer specifies characters instead of bytes.
+      #
+      # Example:
+      #
+      #   "¾ cup".mb_chars.center(8).to_s
+      #   #=> " ¾ cup  "
+      #
+      #   "¾ cup".mb_chars.center(8, " ").to_s # Use non-breaking whitespace
+      #   #=> " ¾ cup  "
+      def center(integer, padstr=' ')
+        justify(integer, :center, padstr)
       end
-    end
 
-    private
+      # Strips entire range of Unicode whitespace from the right of the string.
+      def rstrip
+        chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
+      end
+      
+      # Strips entire range of Unicode whitespace from the left of the string.
+      def lstrip
+        chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
+      end
+      
+      # Strips entire range of Unicode whitespace from the right and left of the string.
+      def strip
+        rstrip.lstrip
+      end
+      
+      # Returns the number of codepoints in the string
+      def size
+        self.class.u_unpack(@wrapped_string).size
+      end
+      alias_method :length, :size
       
-      # +utf8_pragma+ checks if it can send this string to the handlers. It makes sure @string isn't nil and $KCODE is
-      # set to 'UTF8'.
-      def utf8_pragma?
-        !@string.nil? && ($KCODE == 'UTF8')
+      # Reverses all characters in the string
+      #
+      # Example:
+      #   'Café'.mb_chars.reverse.to_s #=> 'éfaC'
+      def reverse
+        chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
       end
+      
+      # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
+      # character.
+      #
+      # Example:
+      #   'こにちわ'.mb_chars.slice(2..3).to_s #=> "ちわ"
+      def slice(*args)
+        if args.size > 2
+          raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
+        elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
+          raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
+        elsif (args.size == 2 && !args[1].is_a?(Numeric))
+          raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
+        elsif args[0].kind_of? Range
+          cps = self.class.u_unpack(@wrapped_string).slice(*args)
+          result = cps.nil? ? nil : cps.pack('U*')
+        elsif args[0].kind_of? Regexp
+          result = @wrapped_string.slice(*args)
+        elsif args.size == 1 && args[0].kind_of?(Numeric)
+          character = self.class.u_unpack(@wrapped_string)[args[0]]
+          result = character.nil? ? nil : [character].pack('U')
+        else
+          result = self.class.u_unpack(@wrapped_string).slice(*args).pack('U*')
+        end
+        result.nil? ? nil : chars(result)
+      end
+      alias_method :[], :slice
+
+      # Convert characters in the string to uppercase
+      #
+      # Example:
+      #   'Laurent, òu sont les tests?'.mb_chars.upcase.to_s #=> "LAURENT, ÒU SONT LES TESTS?"
+      def upcase
+        apply_mapping :uppercase_mapping
+      end
+
+      # Convert characters in the string to lowercase
+      #
+      # Example:
+      #   'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
+      def downcase
+        apply_mapping :lowercase_mapping
+      end
+
+      # Converts the first character to uppercase and the remainder to lowercase
+      #
+      # Example:
+      #  'über'.mb_chars.capitalize.to_s #=> "Über"
+      def capitalize
+        (slice(0) || '').upcase + (slice(1..-1) || '').downcase
+      end
+
+      # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
+      # passing strings to databases and validations.
+      #
+      # * <tt>str</tt> - The string to perform normalization on.
+      # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
+      #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
+      #   ActiveSupport::Multibyte.default_normalization_form
+      def normalize(form=ActiveSupport::Multibyte.default_normalization_form)
+        # See http://www.unicode.org/reports/tr15, Table 1
+        codepoints = self.class.u_unpack(@wrapped_string)
+        chars(case form
+          when :d
+            self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))
+          when :c
+            self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)))
+          when :kd
+            self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))
+          when :kc
+            self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)))
+          else
+            raise ArgumentError, "#{form} is not a valid normalization variant", caller
+        end.pack('U*'))
+      end
+
+      # Performs canonical decomposition on all the characters.
+      #
+      # Example:
+      #   'é'.length #=> 2
+      #   'é'.mb_chars.decompose.to_s.length #=> 3
+      def decompose
+        chars(self.class.decompose_codepoints(:canonical, self.class.u_unpack(@wrapped_string)).pack('U*'))
+      end
+
+      # Performs composition on all the characters.
+      #
+      # Example:
+      #   'é'.length #=> 3
+      #   'é'.mb_chars.compose.to_s.length #=> 2
+      def compose
+        chars(self.class.compose_codepoints(self.class.u_unpack(@wrapped_string)).pack('U*'))
+      end
+
+      # Returns the number of grapheme clusters in the string.
+      #
+      # Example:
+      #   'क्षि'.mb_chars.length #=> 4
+      #   'क्षि'.mb_chars.g_length #=> 3
+      def g_length
+        self.class.g_unpack(@wrapped_string).length
+      end
+
+      def tidy_bytes
+        chars(self.class.tidy_bytes(@wrapped_string))
+      end
+
+      %w(lstrip rstrip strip reverse upcase downcase slice tidy_bytes capitalize).each do |method|
+        define_method("#{method}!") do |*args|
+          unless args.nil?
+            @wrapped_string = send(method, *args).to_s
+          else
+            @wrapped_string = send(method).to_s
+          end
+          self
+        end
+      end
+
+      class << self
+
+        # Unpack the string at codepoints boundaries
+        def u_unpack(str)
+          begin
+            str.unpack 'U*'
+          rescue ArgumentError
+            raise EncodingError.new('malformed UTF-8 character')
+          end
+        end
+
+        # Detect whether the codepoint is in a certain character class. Primarily used by the
+        # grapheme cluster support.
+        def in_char_class?(codepoint, classes)
+          classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
+        end
+
+        # Unpack the string at grapheme boundaries
+        def g_unpack(str)
+          codepoints = u_unpack(str)
+          unpacked = []
+          pos = 0
+          marker = 0
+          eoc = codepoints.length
+          while(pos < eoc)
+            pos += 1
+            previous = codepoints[pos-1]
+            current = codepoints[pos]
+            if (
+                # CR X LF
+                one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
+                # L X (L|V|LV|LVT)
+                two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
+                # (LV|V) X (V|T)
+                three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
+                # (LVT|T) X (T)
+                four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
+                # X Extend
+                five = (UCD.boundary[:extend] === current)
+              )
+            else
+              unpacked << codepoints[marker..pos-1]
+              marker = pos
+            end
+          end 
+          unpacked
+        end
+
+        # Reverse operation of g_unpack
+        def g_pack(unpacked)
+          (unpacked.flatten).pack('U*')
+        end
+
+        # Generates a padding string of a certain size.
+        def padding(padsize, padstr=' ')
+          if padsize != 0
+            new(padstr * ((padsize / u_unpack(padstr).size) + 1)).slice(0, padsize)
+          else
+            ''
+          end
+        end
+
+        # Re-order codepoints so the string becomes canonical
+        def reorder_characters(codepoints)
+          length = codepoints.length- 1
+          pos = 0
+          while pos < length do
+            cp1, cp2 = UCD.codepoints[codepoints[pos]], UCD.codepoints[codepoints[pos+1]]
+            if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
+              codepoints[pos..pos+1] = cp2.code, cp1.code
+              pos += (pos > 0 ? -1 : 1)
+            else
+              pos += 1
+            end
+          end
+          codepoints
+        end
+
+        # Decompose composed characters to the decomposed form
+        def decompose_codepoints(type, codepoints)
+          codepoints.inject([]) do |decomposed, cp|
+            # if it's a hangul syllable starter character
+            if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
+              sindex = cp - HANGUL_SBASE
+              ncp = [] # new codepoints
+              ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
+              ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
+              tindex = sindex % HANGUL_TCOUNT
+              ncp << (HANGUL_TBASE + tindex) unless tindex == 0
+              decomposed.concat ncp
+            # if the codepoint is decomposable with the current decomposition type
+            elsif (ncp = UCD.codepoints[cp].decomp_mapping) and (!UCD.codepoints[cp].decomp_type || type == :compatability)
+              decomposed.concat decompose_codepoints(type, ncp.dup)
+            else
+              decomposed << cp
+            end
+          end
+        end
+
+        # Compose decomposed characters to the composed form
+        def compose_codepoints(codepoints)
+          pos = 0
+          eoa = codepoints.length - 1
+          starter_pos = 0
+          starter_char = codepoints[0]
+          previous_combining_class = -1
+          while pos < eoa
+            pos += 1
+            lindex = starter_char - HANGUL_LBASE
+            # -- Hangul
+            if 0 <= lindex and lindex < HANGUL_LCOUNT
+              vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
+              if 0 <= vindex and vindex < HANGUL_VCOUNT
+                tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
+                if 0 <= tindex and tindex < HANGUL_TCOUNT
+                  j = starter_pos + 2
+                  eoa -= 2
+                else
+                  tindex = 0
+                  j = starter_pos + 1
+                  eoa -= 1
+                end
+                codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
+              end
+              starter_pos += 1
+              starter_char = codepoints[starter_pos]
+            # -- Other characters
+            else
+              current_char = codepoints[pos]
+              current = UCD.codepoints[current_char]
+              if current.combining_class > previous_combining_class
+                if ref = UCD.composition_map[starter_char]
+                  composition = ref[current_char]
+                else
+                  composition = nil
+                end
+                unless composition.nil?
+                  codepoints[starter_pos] = composition
+                  starter_char = composition
+                  codepoints.delete_at pos
+                  eoa -= 1
+                  pos -= 1
+                  previous_combining_class = -1
+                else
+                  previous_combining_class = current.combining_class
+                end
+              else
+                previous_combining_class = current.combining_class
+              end
+              if current.combining_class == 0
+                starter_pos = pos
+                starter_char = codepoints[pos]
+              end
+            end
+          end
+          codepoints
+        end
+
+        # Replaces all the non-UTF-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid UTF-8 string
+        def tidy_bytes(str)
+          str.split(//u).map do |c|
+            if !UTF8_PAT.match(c)
+              n = c.unpack('C')[0]
+              n < 128 ? n.chr :
+              n < 160 ? [UCD.cp1252[n] || n].pack('U') :
+              n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
+            else
+              c
+            end
+          end.join
+        end
+      end
+
+      protected
+
+        # Translate a byte offset in the wrapped string to a character offset by looking for the character boundary
+        def translate_offset(byte_offset)
+          return nil if byte_offset.nil?
+          return 0   if @wrapped_string == ''
+          chunk = @wrapped_string[0..byte_offset]
+          begin
+            begin
+              chunk.unpack('U*').length - 1
+            rescue ArgumentError => e
+              chunk = @wrapped_string[0..(byte_offset+=1)]
+              # Stop retrying at the end of the string
+              raise e unless byte_offset < chunk.length 
+              # We damaged a character, retry
+              retry
+            end
+          # Catch the ArgumentError so we can throw our own
+          rescue ArgumentError 
+            raise EncodingError, 'malformed UTF-8 character'
+          end
+        end
+
+        # Justifies a string in a certain way. Valid values for <tt>way</tt> are <tt>:right</tt>, <tt>:left</tt> and
+        # <tt>:center</tt>.
+        def justify(integer, way, padstr=' ')
+          raise ArgumentError, "zero width padding" if padstr.length == 0
+          padsize = integer - size
+          padsize = padsize > 0 ? padsize : 0
+          case way
+          when :right
+            result = @wrapped_string.dup.insert(0, self.class.padding(padsize, padstr))
+          when :left
+            result = @wrapped_string.dup.insert(-1, self.class.padding(padsize, padstr))
+          when :center
+            lpad = self.class.padding((padsize / 2.0).floor, padstr)
+            rpad = self.class.padding((padsize / 2.0).ceil, padstr)
+            result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
+          end
+          chars(result)
+        end
+
+        # Map codepoints to one of its attributes.
+        def apply_mapping(mapping)
+          chars(self.class.u_unpack(@wrapped_string).map do |codepoint|
+            cp = UCD.codepoints[codepoint]
+            if cp and (ncp = cp.send(mapping)) and ncp > 0
+              ncp
+            else
+              codepoint
+            end
+          end.pack('U*'))
+        end
+
+        # Creates a new instance
+        def chars(str)
+          self.class.new(str)
+        end
+    end
   end
-end
-
-# When we can load the utf8proc library, override normalization with the faster methods
-begin
-  require 'utf8proc_native'
-  require 'active_support/multibyte/handlers/utf8_handler_proc'
-  ActiveSupport::Multibyte::Chars.handler = ActiveSupport::Multibyte::Handlers::UTF8HandlerProc
-rescue LoadError
-  ActiveSupport::Multibyte::Chars.handler = ActiveSupport::Multibyte::Handlers::UTF8Handler
-end
+end
+\ No newline at end of file
diff --git a/activesupport/lib/active_support/multibyte/exceptions.rb b/activesupport/lib/active_support/multibyte/exceptions.rb
new file mode 100644
index 0000000000..af760cc561
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/exceptions.rb
@@ -0,0 +1,7 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    class EncodingError < StandardError; end # a StandardError; presumably raised for malformed input - confirm in chars.rb
+  end
+end
+\ No newline at end of file
diff --git a/activesupport/lib/active_support/multibyte/generators/generate_tables.rb b/activesupport/lib/active_support/multibyte/generators/generate_tables.rb
deleted file mode 100755
index 7f807585c5..0000000000
--- a/activesupport/lib/active_support/multibyte/generators/generate_tables.rb
+++ /dev/null
@@ -1,149 +0,0 @@
-#!/usr/bin/env ruby
-begin
-  require File.dirname(__FILE__) + '/../../../active_support'
-rescue IOError
-end
-require 'open-uri'
-require 'tmpdir'
-
-module ActiveSupport::Multibyte::Handlers #:nodoc:
-  class UnicodeDatabase #:nodoc:
-    def self.load
-      [Hash.new(Codepoint.new),[],{},{}]
-    end
-  end
-  
-  class UnicodeTableGenerator #:nodoc:
-    BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
-    SOURCES = {
-      :codepoints => BASE_URI + 'UnicodeData.txt',
-      :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
-      :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
-      :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
-    }
-    
-    def initialize
-      @ucd = UnicodeDatabase.new
-      
-      default = Codepoint.new
-      default.combining_class = 0
-      default.uppercase_mapping = 0
-      default.lowercase_mapping = 0
-      @ucd.codepoints = Hash.new(default)
-      
-      @ucd.composition_exclusion = []
-      @ucd.composition_map = {}
-      @ucd.boundary = {}
-      @ucd.cp1252 = {}
-    end
-    
-    def parse_codepoints(line)
-      codepoint = Codepoint.new
-      raise "Could not parse input." unless line =~ /^
-        ([0-9A-F]+);        # code
-        ([^;]+);            # name
-        ([A-Z]+);           # general category
-        ([0-9]+);           # canonical combining class
-        ([A-Z]+);           # bidi class
-        (<([A-Z]*)>)?       # decomposition type
-        ((\ ?[0-9A-F]+)*);  # decompomposition mapping
-        ([0-9]*);           # decimal digit
-        ([0-9]*);           # digit
-        ([^;]*);            # numeric
-        ([YN]*);            # bidi mirrored
-        ([^;]*);            # unicode 1.0 name
-        ([^;]*);            # iso comment
-        ([0-9A-F]*);        # simple uppercase mapping
-        ([0-9A-F]*);        # simple lowercase mapping
-        ([0-9A-F]*)$/ix     # simple titlecase mapping
-      codepoint.code              = $1.hex
-      #codepoint.name              = $2
-      #codepoint.category          = $3
-      codepoint.combining_class   = Integer($4)
-      #codepoint.bidi_class        = $5
-      codepoint.decomp_type       = $7
-      codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
-      #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
-      codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
-      codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
-      #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
-      @ucd.codepoints[codepoint.code] = codepoint
-    end
-
-    def parse_grapheme_break_property(line)
-      if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
-        type = $2.downcase.intern
-        @ucd.boundary[type] ||= []
-        if $1.include? '..'
-          parts = $1.split '..'
-          @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
-        else
-          @ucd.boundary[type] << $1.hex
-        end
-      end
-    end
-    
-    def parse_composition_exclusion(line)
-      if line =~ /^([0-9A-F]+)/i
-        @ucd.composition_exclusion << $1.hex
-      end
-    end
-    
-    def parse_cp1252(line)
-      if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
-        @ucd.cp1252[$1.hex] = $2.hex
-      end
-    end
-    
-    def create_composition_map
-      @ucd.codepoints.each do |_, cp|
-        if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
-          @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
-          @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
-        end
-      end
-    end
-
-    def normalize_boundary_map
-      @ucd.boundary.each do |k,v|
-        if [:lf, :cr].include? k
-          @ucd.boundary[k] = v[0]
-        end
-      end
-    end
-  
-    def parse
-      SOURCES.each do |type, url|
-        filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
-        unless File.exist?(filename)
-          $stderr.puts "Downloading #{url.split('/').last}"
-          File.open(filename, 'wb') do |target|
-            open(url) do |source|
-              source.each_line { |line| target.write line }
-            end
-          end
-        end
-        File.open(filename) do |file|
-          file.each_line { |line| send "parse_#{type}".intern, line }
-        end        
-      end
-      create_composition_map
-      normalize_boundary_map
-    end
-    
-    def dump_to(filename)
-      File.open(filename, 'wb') do |f|
-        f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
-      end
-    end
-  end
-end
-
-if __FILE__ == $0
-  filename = ActiveSupport::Multibyte::Handlers::UnicodeDatabase.filename
-  generator = ActiveSupport::Multibyte::Handlers::UnicodeTableGenerator.new
-  generator.parse
-  print "Writing to: #{filename}"
-  generator.dump_to filename
-  puts " (#{File.size(filename)} bytes)"
-end
diff --git a/activesupport/lib/active_support/multibyte/handlers/passthru_handler.rb b/activesupport/lib/active_support/multibyte/handlers/passthru_handler.rb
deleted file mode 100644
index 916215c2ce..0000000000
--- a/activesupport/lib/active_support/multibyte/handlers/passthru_handler.rb
+++ /dev/null
@@ -1,9 +0,0 @@
-# Chars uses this handler when $KCODE is not set to 'UTF8'. Because this handler doesn't define any methods all call
-# will be forwarded to String.
-class ActiveSupport::Multibyte::Handlers::PassthruHandler #:nodoc:
-  
-  # Return the original byteoffset
-  def self.translate_offset(string, byte_offset) #:nodoc:
-    byte_offset
-  end
-end
-\ No newline at end of file
diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
deleted file mode 100644
index aa9c16f575..0000000000
--- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
+++ /dev/null
@@ -1,564 +0,0 @@
-# Contains all the handlers and helper classes
-module ActiveSupport::Multibyte::Handlers #:nodoc:
-  class EncodingError < ArgumentError #:nodoc:
-  end
-  
-  class Codepoint #:nodoc:
-    attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
-  end
-  
-  class UnicodeDatabase #:nodoc:
-    attr_writer :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
-    
-    # self-expiring methods that lazily load the Unicode database and then return the value.
-    [:codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252].each do |attr_name|
-      class_eval(<<-EOS, __FILE__, __LINE__)
-        def #{attr_name}
-          load
-          @#{attr_name}
-        end
-      EOS
-    end
-    
-    # Shortcut to ucd.codepoints[]
-    def [](index); codepoints[index]; end
-    
-    # Returns the directory in which the data files are stored
-    def self.dirname
-      File.dirname(__FILE__) + '/../../values/'
-    end
-    
-    # Returns the filename for the data file for this version
-    def self.filename
-      File.expand_path File.join(dirname, "unicode_tables.dat")
-    end
-    
-    # Loads the unicode database and returns all the internal objects of UnicodeDatabase
-    # Once the values have been loaded, define attr_reader methods for the instance variables.
-    def load
-      begin
-        @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
-      rescue Exception => e
-          raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable")
-      end
-      @codepoints ||= Hash.new(Codepoint.new)
-      @composition_exclusion ||= []
-      @composition_map ||= {}
-      @boundary ||= {}
-      @cp1252 ||= {}
-      
-      # Redefine the === method so we can write shorter rules for grapheme cluster breaks
-      @boundary.each do |k,_|
-        @boundary[k].instance_eval do
-          def ===(other)
-            detect { |i| i === other } ? true : false
-          end
-        end if @boundary[k].kind_of?(Array)
-      end
-
-      # define attr_reader methods for the instance variables
-      class << self
-        attr_reader :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
-      end
-    end
-  end
-  
-  # UTF8Handler implements Unicode aware operations for strings, these operations will be used by the Chars
-  # proxy when $KCODE is set to 'UTF8'.
-  class UTF8Handler
-    # Hangul character boundaries and properties
-    HANGUL_SBASE = 0xAC00
-    HANGUL_LBASE = 0x1100
-    HANGUL_VBASE = 0x1161
-    HANGUL_TBASE = 0x11A7
-    HANGUL_LCOUNT = 19
-    HANGUL_VCOUNT = 21
-    HANGUL_TCOUNT = 28
-    HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
-    HANGUL_SCOUNT = 11172
-    HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
-    HANGUL_JAMO_FIRST = 0x1100
-    HANGUL_JAMO_LAST = 0x11FF
-    
-    # All the unicode whitespace
-    UNICODE_WHITESPACE = [
-      (0x0009..0x000D).to_a,  # White_Space # Cc   [5] <control-0009>..<control-000D>
-      0x0020,          # White_Space # Zs       SPACE
-      0x0085,          # White_Space # Cc       <control-0085>
-      0x00A0,          # White_Space # Zs       NO-BREAK SPACE
-      0x1680,          # White_Space # Zs       OGHAM SPACE MARK
-      0x180E,          # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
-      (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
-      0x2028,          # White_Space # Zl       LINE SEPARATOR
-      0x2029,          # White_Space # Zp       PARAGRAPH SEPARATOR
-      0x202F,          # White_Space # Zs       NARROW NO-BREAK SPACE
-      0x205F,          # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
-      0x3000,          # White_Space # Zs       IDEOGRAPHIC SPACE
-    ].flatten.freeze
-    
-    # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
-    # between little and big endian. This is not an issue in utf-8, so it must be ignored.
-    UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
-    
-    # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
-     UTF8_PAT = /\A(?:
-                   [\x00-\x7f]                                     |
-                   [\xc2-\xdf] [\x80-\xbf]                         |
-                   \xe0        [\xa0-\xbf] [\x80-\xbf]             |
-                   [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
-                   \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                   [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                   \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
-                  )*\z/xn
-    
-    # Returns a regular expression pattern that matches the passed Unicode codepoints
-    def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
-      array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') 
-    end
-    UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
-    UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
-    
-    class << self
-      
-      # ///
-      # /// BEGIN String method overrides
-      # ///
-      
-      # Inserts the passed string at specified codepoint offsets
-      def insert(str, offset, fragment)
-        str.replace(
-          u_unpack(str).insert(
-            offset,
-            u_unpack(fragment)
-          ).flatten.pack('U*')
-        )
-      end
-      
-      # Returns the position of the passed argument in the string, counting in codepoints
-      def index(str, *args)
-        bidx = str.index(*args)
-        bidx ? (u_unpack(str.slice(0...bidx)).size) : nil
-      end
-      
-      # Works just like the indexed replace method on string, except instead of byte offsets you specify
-      # character offsets.
-      #
-      # Example:
-      #
-      #   s = "Müller"
-      #   s.chars[2] = "e" # Replace character with offset 2
-      #   s # => "Müeler"
-      #
-      #   s = "Müller"
-      #   s.chars[1, 2] = "ö" # Replace 2 characters at character offset 1
-      #   s # => "Möler"
-      def []=(str, *args)
-        replace_by = args.pop
-        # Indexed replace with regular expressions already works
-        return str[*args] = replace_by if args.first.is_a?(Regexp)
-        result = u_unpack(str)
-        if args[0].is_a?(Fixnum)
-          raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
-          min = args[0]
-          max = args[1].nil? ? min : (min + args[1] - 1)
-          range = Range.new(min, max)
-          replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum)
-        elsif args.first.is_a?(Range)
-          raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length
-          range = args[0]
-        else
-          needle = args[0].to_s
-          min = index(str, needle)
-          max = min + length(needle) - 1
-          range = Range.new(min, max)
-        end
-        result[range] = u_unpack(replace_by)
-        str.replace(result.pack('U*'))
-      end
-      
-      # Works just like String#rjust, only integer specifies characters instead of bytes.
-      #
-      # Example:
-      #
-      #   "¾ cup".chars.rjust(8).to_s
-      #   # => "   ¾ cup"
-      #
-      #   "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
-      #   # => "   ¾ cup"
-      def rjust(str, integer, padstr=' ')
-        justify(str, integer, :right, padstr)
-      end
-      
-      # Works just like String#ljust, only integer specifies characters instead of bytes.
-      #
-      # Example:
-      #
-      #   "¾ cup".chars.rjust(8).to_s
-      #   # => "¾ cup   "
-      #
-      #   "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
-      #   # => "¾ cup   "
-      def ljust(str, integer, padstr=' ')
-        justify(str, integer, :left, padstr)
-      end
-      
-      # Works just like String#center, only integer specifies characters instead of bytes.
-      #
-      # Example:
-      #
-      #   "¾ cup".chars.center(8).to_s
-      #   # => " ¾ cup  "
-      #
-      #   "¾ cup".chars.center(8, " ").to_s # Use non-breaking whitespace
-      #   # => " ¾ cup  "
-      def center(str, integer, padstr=' ')
-        justify(str, integer, :center, padstr)
-      end
-      
-      # Does Unicode-aware rstrip
-      def rstrip(str)
-        str.gsub(UNICODE_TRAILERS_PAT, '')
-      end
-      
-      # Does Unicode-aware lstrip
-      def lstrip(str)
-        str.gsub(UNICODE_LEADERS_PAT, '')
-      end
-      
-      # Removed leading and trailing whitespace
-      def strip(str)
-        str.gsub(UNICODE_LEADERS_PAT, '').gsub(UNICODE_TRAILERS_PAT, '')
-      end
-      
-      # Returns the number of codepoints in the string
-      def size(str)
-        u_unpack(str).size
-      end
-      alias_method :length, :size
-      
-      # Reverses codepoints in the string.
-      def reverse(str)
-        u_unpack(str).reverse.pack('U*')
-      end
-      
-      # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
-      # character.
-      def slice(str, *args)
-        if args.size > 2
-          raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
-        elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
-          raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
-        elsif (args.size == 2 && !args[1].is_a?(Numeric))
-          raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
-        elsif args[0].kind_of? Range
-          cps = u_unpack(str).slice(*args)
-          cps.nil? ? nil : cps.pack('U*')
-        elsif args[0].kind_of? Regexp
-          str.slice(*args)
-        elsif args.size == 1 && args[0].kind_of?(Numeric)
-          u_unpack(str)[args[0]]
-        else
-          u_unpack(str).slice(*args).pack('U*')
-        end
-      end
-      alias_method :[], :slice
-      
-      # Convert characters in the string to uppercase
-      def upcase(str); to_case :uppercase_mapping, str; end
-      
-      # Convert characters in the string to lowercase
-      def downcase(str); to_case :lowercase_mapping, str; end
-      
-      # Returns a copy of +str+ with the first character converted to uppercase and the remainder to lowercase
-      def capitalize(str)
-        upcase(slice(str, 0..0)) + downcase(slice(str, 1..-1) || '')
-      end
-      
-      # ///
-      # /// Extra String methods for unicode operations
-      # ///
-      
-      # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
-      # passing strings to databases and validations.
-      #
-      # * <tt>str</tt> - The string to perform normalization on.
-      # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
-      #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
-      #   ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM.
-      def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM)
-        # See http://www.unicode.org/reports/tr15, Table 1
-        codepoints = u_unpack(str)
-        case form
-          when :d
-            reorder_characters(decompose_codepoints(:canonical, codepoints))
-          when :c
-            compose_codepoints reorder_characters(decompose_codepoints(:canonical, codepoints))
-          when :kd
-            reorder_characters(decompose_codepoints(:compatability, codepoints))
-          when :kc
-            compose_codepoints reorder_characters(decompose_codepoints(:compatability, codepoints))
-          else
-            raise ArgumentError, "#{form} is not a valid normalization variant", caller
-        end.pack('U*')
-      end
-      
-      # Perform decomposition on the characters in the string
-      def decompose(str)
-        decompose_codepoints(:canonical, u_unpack(str)).pack('U*')
-      end
-      
-      # Perform composition on the characters in the string
-      def compose(str)
-        compose_codepoints u_unpack(str).pack('U*')
-      end
-      
-      # ///
-      # /// BEGIN Helper methods for unicode operation
-      # ///
-      
-      # Used to translate an offset from bytes to characters, for instance one received from a regular expression match
-      def translate_offset(str, byte_offset)
-        return nil if byte_offset.nil?
-        return 0 if str == ''
-        chunk = str[0..byte_offset]
-        begin
-          begin
-            chunk.unpack('U*').length - 1
-          rescue ArgumentError => e
-            chunk = str[0..(byte_offset+=1)]
-            # Stop retrying at the end of the string
-            raise e unless byte_offset < chunk.length 
-            # We damaged a character, retry
-            retry
-          end
-        # Catch the ArgumentError so we can throw our own
-        rescue ArgumentError 
-          raise EncodingError.new('malformed UTF-8 character')
-        end
-      end
-      
-      # Checks if the string is valid UTF8.
-      def consumes?(str)
-        # Unpack is a little bit faster than regular expressions
-        begin
-          str.unpack('U*')
-          true
-        rescue ArgumentError
-          false
-        end
-      end
-      
-      # Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed
-      # in future versions.
-      def g_length(str)
-        g_unpack(str).length
-      end
-      
-      # Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
-      def tidy_bytes(str)
-        str.split(//u).map do |c|
-          if !UTF8_PAT.match(c)
-            n = c.unpack('C')[0]
-            n < 128 ? n.chr :
-            n < 160 ? [UCD.cp1252[n] || n].pack('U') :
-            n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
-          else
-            c
-          end
-        end.join
-      end
-      
-      protected
-      
-      # Detect whether the codepoint is in a certain character class. Primarily used by the
-      # grapheme cluster support.
-      def in_char_class?(codepoint, classes)
-        classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
-      end
-      
-      # Unpack the string at codepoints boundaries
-      def u_unpack(str)
-        begin
-          str.unpack 'U*'
-        rescue ArgumentError
-          raise EncodingError.new('malformed UTF-8 character')
-        end
-      end
-      
-      # Unpack the string at grapheme boundaries instead of codepoint boundaries
-      def g_unpack(str)
-        codepoints = u_unpack(str)
-        unpacked = []
-        pos = 0
-        marker = 0
-        eoc = codepoints.length
-        while(pos < eoc)
-          pos += 1
-          previous = codepoints[pos-1]
-          current = codepoints[pos]
-          if (
-              # CR X LF
-              one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
-              # L X (L|V|LV|LVT)
-              two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
-              # (LV|V) X (V|T)
-              three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
-              # (LVT|T) X (T)
-              four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
-              # X Extend
-              five = (UCD.boundary[:extend] === current)
-            )
-          else
-            unpacked << codepoints[marker..pos-1]
-            marker = pos
-          end
-        end 
-        unpacked
-      end
-      
-      # Reverse operation of g_unpack
-      def g_pack(unpacked)
-        unpacked.flatten
-      end
-      
-      # Justifies a string in a certain way. Valid values for <tt>way</tt> are <tt>:right</tt>, <tt>:left</tt> and
-      # <tt>:center</tt>. Is primarily used as a helper method by <tt>rjust</tt>, <tt>ljust</tt> and <tt>center</tt>.
-      def justify(str, integer, way, padstr=' ')
-        raise ArgumentError, "zero width padding" if padstr.length == 0
-        padsize = integer - size(str)
-        padsize = padsize > 0 ? padsize : 0
-        case way
-        when :right
-          str.dup.insert(0, padding(padsize, padstr))
-        when :left
-          str.dup.insert(-1, padding(padsize, padstr))
-        when :center
-          lpad = padding((padsize / 2.0).floor, padstr)
-          rpad = padding((padsize / 2.0).ceil, padstr)
-          str.dup.insert(0, lpad).insert(-1, rpad)
-        end
-      end
-      
-      # Generates a padding string of a certain size.
-      def padding(padsize, padstr=' ')
-        if padsize != 0
-          slice(padstr * ((padsize / size(padstr)) + 1), 0, padsize)
-        else
-          ''
-        end
-      end
-      
-      # Convert characters to a different case
-      def to_case(way, str)
-        u_unpack(str).map do |codepoint|
-          cp = UCD[codepoint] 
-          unless cp.nil?
-            ncp = cp.send(way)
-            ncp > 0 ? ncp : codepoint
-          else
-            codepoint
-          end
-        end.pack('U*')
-      end
-      
-      # Re-order codepoints so the string becomes canonical
-      def reorder_characters(codepoints)
-        length = codepoints.length- 1
-        pos = 0
-        while pos < length do
-          cp1, cp2 = UCD[codepoints[pos]], UCD[codepoints[pos+1]]
-          if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
-            codepoints[pos..pos+1] = cp2.code, cp1.code
-            pos += (pos > 0 ? -1 : 1)
-          else
-            pos += 1
-          end
-        end
-        codepoints
-      end
-      
-      # Decompose composed characters to the decomposed form
-      def decompose_codepoints(type, codepoints)
-        codepoints.inject([]) do |decomposed, cp|
-          # if it's a hangul syllable starter character
-          if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
-            sindex = cp - HANGUL_SBASE
-            ncp = [] # new codepoints
-            ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
-            ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
-            tindex = sindex % HANGUL_TCOUNT
-            ncp << (HANGUL_TBASE + tindex) unless tindex == 0
-            decomposed.concat ncp
-          # if the codepoint is decomposable in with the current decomposition type
-          elsif (ncp = UCD[cp].decomp_mapping) and (!UCD[cp].decomp_type || type == :compatability)
-            decomposed.concat decompose_codepoints(type, ncp.dup)
-          else
-            decomposed << cp
-          end
-        end
-      end
-      
-      # Compose decomposed characters to the composed form
-      def compose_codepoints(codepoints)
-        pos = 0
-        eoa = codepoints.length - 1
-        starter_pos = 0
-        starter_char = codepoints[0]
-        previous_combining_class = -1
-        while pos < eoa
-          pos += 1
-          lindex = starter_char - HANGUL_LBASE
-          # -- Hangul
-          if 0 <= lindex and lindex < HANGUL_LCOUNT
-            vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
-            if 0 <= vindex and vindex < HANGUL_VCOUNT
-              tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
-              if 0 <= tindex and tindex < HANGUL_TCOUNT
-                j = starter_pos + 2
-                eoa -= 2
-              else
-                tindex = 0
-                j = starter_pos + 1
-                eoa -= 1
-              end
-              codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
-            end
-            starter_pos += 1
-            starter_char = codepoints[starter_pos]
-          # -- Other characters
-          else
-            current_char = codepoints[pos]
-            current = UCD[current_char]
-            if current.combining_class > previous_combining_class
-              if ref = UCD.composition_map[starter_char]
-                composition = ref[current_char]
-              else
-                composition = nil
-              end
-              unless composition.nil?
-                codepoints[starter_pos] = composition
-                starter_char = composition
-                codepoints.delete_at pos
-                eoa -= 1
-                pos -= 1
-                previous_combining_class = -1
-              else
-                previous_combining_class = current.combining_class
-              end
-            else
-              previous_combining_class = current.combining_class
-            end
-            if current.combining_class == 0
-              starter_pos = pos
-              starter_char = codepoints[pos]
-            end
-          end
-        end
-        codepoints
-      end
-      
-      # UniCode Database
-      UCD = UnicodeDatabase.new
-    end
-  end
-end
diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler_proc.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler_proc.rb
deleted file mode 100644
index f10eecc622..0000000000
--- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler_proc.rb
+++ /dev/null
@@ -1,43 +0,0 @@
-# Methods in this handler call functions in the utf8proc ruby extension. These are significantly faster than the
-# pure ruby versions. Chars automatically uses this handler when it can load the utf8proc extension. For
-# documentation on handler methods see UTF8Handler.
-class ActiveSupport::Multibyte::Handlers::UTF8HandlerProc < ActiveSupport::Multibyte::Handlers::UTF8Handler #:nodoc:
-  class << self
-    def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM) #:nodoc:
-      codepoints = str.unpack('U*')
-      case form
-        when :d
-          utf8map(str, :stable)
-        when :c
-          utf8map(str, :stable, :compose)
-        when :kd
-          utf8map(str, :stable, :compat)
-        when :kc
-          utf8map(str, :stable, :compose, :compat)
-        else
-          raise ArgumentError, "#{form} is not a valid normalization variant", caller
-      end
-    end
-    
-    def decompose(str) #:nodoc:
-      utf8map(str, :stable)
-    end
-    
-    def downcase(str) #:nodoc:c
-      utf8map(str, :casefold)
-    end
-    
-    protected
-    
-    def utf8map(str, *option_array) #:nodoc:
-      options = 0
-      option_array.each do |option|
-        flag = Utf8Proc::Options[option]
-        raise ArgumentError, "Unknown argument given to utf8map." unless
-          flag
-        options |= flag
-      end
-      return Utf8Proc::utf8map(str, options)
-    end
-  end
-end
diff --git a/activesupport/lib/active_support/multibyte/unicode_database.rb b/activesupport/lib/active_support/multibyte/unicode_database.rb
new file mode 100644
index 0000000000..3b8cf8f9eb
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/unicode_database.rb
@@ -0,0 +1,71 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    # Plain attribute holder for one codepoint in the Unicode database: code, combining class, decomposition and case mappings
+    class Codepoint
+      attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
+    end
+
+    # Holds static data from the Unicode database; the tables are read from disk on first access
+    class UnicodeDatabase
+      ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
+
+      attr_writer(*ATTRIBUTES)
+
+      def initialize
+        @codepoints = Hash.new(Codepoint.new)
+        @composition_exclusion = []
+        @composition_map = {}
+        @boundary = {}
+        @cp1252 = {}
+      end
+
+      # Lazy readers: load the tables on first use. __LINE__ + 1 is the heredoc's first line, for accurate backtraces
+      ATTRIBUTES.each do |attr_name|
+        class_eval(<<-EOS, __FILE__, __LINE__ + 1)
+          def #{attr_name}
+            load
+            @#{attr_name}
+          end
+        EOS
+      end
+
+      # Loads the tables from the data file. Raises IOError when the file can't be read or unmarshalled.
+      def load
+        begin
+          @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
+        rescue StandardError => e # not Exception: interrupts and exit requests must propagate unchanged
+          raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable")
+        end
+
+        # Give every Array in the boundary table a === that is true when any member matches, so the
+        # rules for grapheme cluster breaks can be written more briefly
+        @boundary.each_value do |members|
+          next unless members.kind_of?(Array)
+          def members.===(other)
+            any? { |member| member === other }
+          end
+        end
+
+        # Shadow the lazy readers with plain attr_readers on this instance so later reads skip load
+        class << self
+          attr_reader(*ATTRIBUTES)
+        end
+      end
+
+      # Returns the directory in which the data files are stored
+      def self.dirname
+        File.dirname(__FILE__) + '/../values/'
+      end
+
+      # Returns the expanded path of the unicode_tables.dat data file
+      def self.filename
+        File.expand_path File.join(dirname, "unicode_tables.dat")
+      end
+    end
+
+    # UCD (Unicode database): the UnicodeDatabase instance created when this file is loaded
+    UCD = UnicodeDatabase.new
+  end
+end
+\ No newline at end of file
author	Manfred Stienstra <manfred@fngtps.com>	2008-09-21 17:21:30 +0200
committer	Manfred Stienstra <manfred@fngtps.com>	2008-09-21 17:21:30 +0200
commit	22f75d539dca7b6f33cbf86e4e9d1944bb22731f (patch)
tree	f3c775cda7f82f5b527864adc363deb3c5eee354 /activesupport/lib/active_support/multibyte
parent	5f83e1844c83c19cf97c6415b943c6ec3cb4bb06 (diff)
download	rails-22f75d539dca7b6f33cbf86e4e9d1944bb22731f.tar.gz rails-22f75d539dca7b6f33cbf86e4e9d1944bb22731f.tar.bz2 rails-22f75d539dca7b6f33cbf86e4e9d1944bb22731f.zip