aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/lib/active_support/multibyte/unicode.rb
diff options
context:
space:
mode:
Diffstat (limited to 'activesupport/lib/active_support/multibyte/unicode.rb')
-rw-r--r--activesupport/lib/active_support/multibyte/unicode.rb105
1 files changed, 46 insertions, 59 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index cbc1608349..84799c2399 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -11,7 +11,7 @@ module ActiveSupport
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
# The Unicode version that is supported by the implementation
- UNICODE_VERSION = '6.2.0'
+ UNICODE_VERSION = '6.3.0'
# The default normalization used for operations that require
# normalization. It can be set to any of the normalizations
@@ -145,7 +145,7 @@ module ActiveSupport
ncp << (HANGUL_TBASE + tindex) unless tindex == 0
decomposed.concat ncp
# if the codepoint is decomposable in with the current decomposition type
- elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability)
+ elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatibility)
decomposed.concat decompose(type, ncp.dup)
else
decomposed << cp
@@ -212,57 +212,43 @@ module ActiveSupport
codepoints
end
- # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
- # resulting in a valid UTF-8 string.
- #
- # Passing +true+ will forcibly tidy all bytes, assuming that the string's
- # encoding is entirely CP1252 or ISO-8859-1.
- def tidy_bytes(string, force = false)
- if force
- return string.unpack("C*").map do |b|
- tidy_byte(b)
- end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+ # Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1.
+ if RUBY_VERSION >= '2.1'
+ # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
+ # resulting in a valid UTF-8 string.
+ #
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's
+ # encoding is entirely CP1252 or ISO-8859-1.
+ def tidy_bytes(string, force = false)
+ return string if string.empty?
+ return recode_windows1252_chars(string) if force
+ string.scrub { |bad| recode_windows1252_chars(bad) }
end
+ else
+ def tidy_bytes(string, force = false)
+ return string if string.empty?
+ return recode_windows1252_chars(string) if force
+
+ # We can't transcode to the same format, so we choose a nearly-identical encoding.
+ # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
+ # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
+ # before returning.
+ reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
+
+ source = string.dup
+ out = ''.force_encoding(Encoding::UTF_8_MAC)
+
+ loop do
+ reader.primitive_convert(source, out)
+ _, _, _, error_bytes, _ = reader.primitive_errinfo
+ break if error_bytes.nil?
+ out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+ end
- bytes = string.unpack("C*")
- conts_expected = 0
- last_lead = 0
-
- bytes.each_index do |i|
-
- byte = bytes[i]
- is_cont = byte > 127 && byte < 192
- is_lead = byte > 191 && byte < 245
- is_unused = byte > 240
- is_restricted = byte > 244
+ reader.finish
- # Impossible or highly unlikely byte? Clean it.
- if is_unused || is_restricted
- bytes[i] = tidy_byte(byte)
- elsif is_cont
- # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
- conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
- else
- if conts_expected > 0
- # Expected continuation, but got ASCII or leading? Clean backwards up to
- # the leading byte.
- (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
- conts_expected = 0
- end
- if is_lead
- # Final byte is leading? Clean it.
- if i == bytes.length - 1
- bytes[i] = tidy_byte(bytes.last)
- else
- # Valid leading byte? Expect continuations determined by position of
- # first zero bit, with max of 3.
- conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
- last_lead = i
- end
- end
- end
+ out.encode!(Encoding::UTF_8)
end
- bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end
# Returns the KC normalization of the string by default. NFKC is
@@ -283,9 +269,9 @@ module ActiveSupport
when :c
compose(reorder_characters(decompose(:canonical, codepoints)))
when :kd
- reorder_characters(decompose(:compatability, codepoints))
+ reorder_characters(decompose(:compatibility, codepoints))
when :kc
- compose(reorder_characters(decompose(:compatability, codepoints)))
+ compose(reorder_characters(decompose(:compatibility, codepoints)))
else
raise ArgumentError, "#{form} is not a valid normalization variant", caller
end.pack('U*')
@@ -307,6 +293,13 @@ module ActiveSupport
class Codepoint
attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
+ # Initializing Codepoint object with default values
+ def initialize
+ @combining_class = 0
+ @uppercase_mapping = 0
+ @lowercase_mapping = 0
+ end
+
def swapcase_mapping
uppercase_mapping > 0 ? uppercase_mapping : lowercase_mapping
end
@@ -384,14 +377,8 @@ module ActiveSupport
end.pack('U*')
end
- def tidy_byte(byte)
- if byte < 160
- [database.cp1252[byte] || byte].pack("U").unpack("C*")
- elsif byte < 192
- [194, byte]
- else
- [195, byte - 64]
- end
+ def recode_windows1252_chars(string)
+ string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end
def database