about summary refs log tree commit diff stats
path: root/activesupport/lib/active_support/multibyte/chars.rb
diff options
context:
space:
mode:
Diffstat (limited to 'activesupport/lib/active_support/multibyte/chars.rb')
-rw-r--r-- activesupport/lib/active_support/multibyte/chars.rb 85
1 files changed, 67 insertions, 18 deletions
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 3eb0bf31f8..38007fd4e7 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -19,7 +19,7 @@ module ActiveSupport #:nodoc:
# bad.explicit_checking_method "T".mb_chars.downcase.to_s
#
# The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
- # encodings you can write your own multibyte string handler and configure it through
+ # encodings you can write your own multibyte string handler and configure it through
# ActiveSupport::Multibyte.proxy_class.
#
# class CharsForUTF32
@@ -458,8 +458,10 @@ module ActiveSupport #:nodoc:
end
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
- def tidy_bytes
- chars(self.class.tidy_bytes(@wrapped_string))
+ #
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
+ def tidy_bytes(force = false) # force defaults to false, so existing zero-argument callers keep working
+ chars(self.class.tidy_bytes(@wrapped_string, force)) # delegates to the class-level tidy_bytes on @wrapped_string, then hands the result to chars (presumably re-wrapping it; confirm)
end
%w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
@@ -528,7 +530,7 @@ module ActiveSupport #:nodoc:
unpacked << codepoints[marker..pos-1]
marker = pos
end
- end
+ end
unpacked
end
@@ -644,33 +646,80 @@ module ActiveSupport #:nodoc:
codepoints
end
+ def tidy_byte(byte) # Maps a single byte value (Integer) to an Array of byte values; every branch returns an Array
+ if byte < 160 # values below 0xA0
+ [UCD.cp1252[byte] || byte].pack("U").unpack("C*") # use the UCD.cp1252 entry if there is one, else the byte itself, as a code point; pack("U") encodes it as UTF-8 and unpack("C*") yields those bytes
+ elsif byte < 192 # 0xA0..0xBF
+ [194, byte] # 0xC2 followed by the byte unchanged: the two-byte UTF-8 form of U+00A0..U+00BF
+ else # 0xC0 and above
+ [195, byte - 64] # 0xC3 followed by (byte - 0x40): the two-byte UTF-8 form of U+00C0..U+00FF
+ end
+ end
+ private :tidy_byte # internal helper; only tidy_bytes calls it in the visible code
+
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
- def tidy_bytes(string)
- string.split(//u).map do |c|
- c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
-
- if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
- n = c.unpack('C')[0]
- n < 128 ? n.chr :
- n < 160 ? [UCD.cp1252[n] || n].pack('U') :
- n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
+ #
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
+ def tidy_bytes(string, force = false) # string is only read (via unpack); the result is a newly built String
+ if force
+ return string.unpack("C*").map do |b| # forced mode: convert every byte unconditionally, no UTF-8 sequence tracking
+ tidy_byte(b)
+ end.flatten.compact.pack("C*").unpack("U*").pack("U*") # flatten the per-byte Arrays into one byte list, then round-trip through code points so the result comes from pack("U*")
+ end
+
+ bytes = string.unpack("C*") # raw byte values; slots are overwritten in place with Arrays when a byte is cleaned
+ conts_expected = 0 # continuation bytes still owed to the current lead byte
+ last_lead = 0 # index of the most recently accepted lead byte
+
+ bytes.each_index do |i|
+
+ byte = bytes[i]
+ is_ascii = byte < 128 # NOTE(review): assigned but never read in this method
+ is_cont = byte > 127 && byte < 192 # 0x80..0xBF
+ is_lead = byte > 191 && byte < 245 # 0xC0..0xF4
+ is_unused = byte > 240 # NOTE(review): also matches 241..244 (0xF1..0xF4), which is_lead accepts; this test runs first below, so those bytes are always cleaned. Confirm intent
+ is_restricted = byte > 244 # 0xF5..0xFF
+
+ # Impossible or highly unlikely byte? Clean it.
+ if is_unused || is_restricted
+ bytes[i] = tidy_byte(byte) # slot now holds an Array; conts_expected is left untouched here
+ elsif is_cont
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
+ conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
else
- c
+ if conts_expected > 0
+ # Expected continuation, but got ASCII or leading? Clean backwards up to
+ # the leading byte.
+ (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} # NOTE(review): a slot in this range may already hold an Array written by the unused/restricted branch above, which tidy_byte would then compare with an Integer. Verify
+ conts_expected = 0
+ end
+ if is_lead
+ # Final byte is leading? Clean it.
+ if i == bytes.length - 1 # NOTE(review): only a trailing lead byte is handled; a sequence cut short after a continuation byte is not cleaned before the final unpack("U*"). Verify
+ bytes[i] = tidy_byte(bytes.last)
+ else
+ # Valid leading byte? Expect continuations determined by position of
+ # first zero bit, with max of 3.
+ conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 # 192..223 -> 1, 224..239 -> 2, 240 -> 3 (values above 240 were already cleaned above)
+ last_lead = i
+ end
+ end
end
- end.join
+ end
+ bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") # empty input yields ""; otherwise the same flatten and code-point round-trip as the forced path
end
end
protected
-
+
def translate_offset(byte_offset) #:nodoc:
return nil if byte_offset.nil?
return 0 if @wrapped_string == ''
-
+
if @wrapped_string.respond_to?(:force_encoding)
@wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT)
end
-
+
begin
@wrapped_string[0...byte_offset].unpack('U*').length
rescue ArgumentError => e