diff options
Diffstat (limited to 'activesupport/lib/active_support/multibyte/unicode.rb')
-rw-r--r-- | activesupport/lib/active_support/multibyte/unicode.rb | 73 |
1 files changed, 30 insertions, 43 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb index f49ca47f14..1845c6ae38 100644 --- a/activesupport/lib/active_support/multibyte/unicode.rb +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -11,7 +11,7 @@ module ActiveSupport NORMALIZATION_FORMS = [:c, :kc, :d, :kd] # The Unicode version that is supported by the implementation - UNICODE_VERSION = '6.1.0' + UNICODE_VERSION = '6.2.0' # The default normalization used for operations that require # normalization. It can be set to any of the normalizations @@ -145,7 +145,7 @@ module ActiveSupport ncp << (HANGUL_TBASE + tindex) unless tindex == 0 decomposed.concat ncp # if the codepoint is decomposable in with the current decomposition type - elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability) + elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatibility) decomposed.concat decompose(type, ncp.dup) else decomposed << cp @@ -218,51 +218,31 @@ module ActiveSupport # Passing +true+ will forcibly tidy all bytes, assuming that the string's # encoding is entirely CP1252 or ISO-8859-1. def tidy_bytes(string, force = false) + return string if string.empty? + if force - return string.unpack("C*").map do |b| - tidy_byte(b) - end.flatten.compact.pack("C*").unpack("U*").pack("U*") + return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace) end - bytes = string.unpack("C*") - conts_expected = 0 - last_lead = 0 - - bytes.each_index do |i| + # We can't transcode to the same format, so we choose a nearly-identical encoding. + # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to + # CP1252 when we get errors. The final string will be 'converted' back to UTF-8 + # before returning. + reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC) - byte = bytes[i] - is_cont = byte > 127 && byte < 192 - is_lead = byte > 191 && byte < 245 - is_unused = byte > 240 - is_restricted = byte > 244 + source = string.dup + out = ''.force_encoding(Encoding::UTF_8_MAC) - # Impossible or highly unlikely byte? Clean it. - if is_unused || is_restricted - bytes[i] = tidy_byte(byte) - elsif is_cont - # Not expecting continuation byte? Clean up. Otherwise, now expect one less. - conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 - else - if conts_expected > 0 - # Expected continuation, but got ASCII or leading? Clean backwards up to - # the leading byte. - (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - conts_expected = 0 - end - if is_lead - # Final byte is leading? Clean it. - if i == bytes.length - 1 - bytes[i] = tidy_byte(bytes.last) - else - # Valid leading byte? Expect continuations determined by position of - # first zero bit, with max of 3. - conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 - last_lead = i - end - end - end + loop do + reader.primitive_convert(source, out) + _, _, _, error_bytes, _ = reader.primitive_errinfo + break if error_bytes.nil? + out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace) end - bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") + + reader.finish + + out.encode!(Encoding::UTF_8) end # Returns the KC normalization of the string by default. NFKC is @@ -283,9 +263,9 @@ module ActiveSupport when :c compose(reorder_characters(decompose(:canonical, codepoints))) when :kd - reorder_characters(decompose(:compatability, codepoints)) + reorder_characters(decompose(:compatibility, codepoints)) when :kc - compose(reorder_characters(decompose(:compatability, codepoints))) + compose(reorder_characters(decompose(:compatibility, codepoints))) else raise ArgumentError, "#{form} is not a valid normalization variant", caller end.pack('U*') @@ -307,6 +287,13 @@ module ActiveSupport class Codepoint attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping + # Initializing Codepoint object with default values + def initialize + @combining_class = 0 + @uppercase_mapping = 0 + @lowercase_mapping = 0 + end + def swapcase_mapping uppercase_mapping > 0 ? uppercase_mapping : lowercase_mapping end |