diff options
author | Jeremy Kemper <jeremy@bitsweat.net> | 2013-12-26 12:14:30 -0800 |
---|---|---|
committer | Jeremy Kemper <jeremy@bitsweat.net> | 2013-12-26 12:14:30 -0800 |
commit | 037665cbe1f7705afaeb924e34eeb0d93f60b290 (patch) | |
tree | a821d4b438106f2bbe5f58d2995f48f09753f39d | |
parent | cef8c8f772cf7a853a1c447f02baecfdde9fa237 (diff) | |
parent | ab195841ddc7302ca6e6fc4a5962bc5ab3b8c09b (diff) | |
download | rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.tar.gz rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.tar.bz2 rails-037665cbe1f7705afaeb924e34eeb0d93f60b290.zip |
Merge pull request #13495 from norman/tidy_bytes
Use String#scrub when available to tidy bytes
-rw-r--r-- | activesupport/lib/active_support/multibyte/unicode.rb | 70 |
1 files changed, 35 insertions, 35 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb index 1845c6ae38..5fd410fbe1 100644 --- a/activesupport/lib/active_support/multibyte/unicode.rb +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -212,37 +212,43 @@ module ActiveSupport codepoints end - # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent - # resulting in a valid UTF-8 string. - # - # Passing +true+ will forcibly tidy all bytes, assuming that the string's - # encoding is entirely CP1252 or ISO-8859-1. - def tidy_bytes(string, force = false) - return string if string.empty? - - if force - return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace) + # Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1. + if RUBY_VERSION >= '2.1' + # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent + # resulting in a valid UTF-8 string. + # + # Passing +true+ will forcibly tidy all bytes, assuming that the string's + # encoding is entirely CP1252 or ISO-8859-1. + def tidy_bytes(string, force = false) + return string if string.empty? + return recode_windows1252_chars(string) if force + string.scrub { |bad| recode_windows1252_chars(bad) } end + else + def tidy_bytes(string, force = false) + return string if string.empty? + return recode_windows1252_chars(string) if force + + # We can't transcode to the same format, so we choose a nearly-identical encoding. + # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to + # CP1252 when we get errors. The final string will be 'converted' back to UTF-8 + # before returning. + reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC) + + source = string.dup + out = ''.force_encoding(Encoding::UTF_8_MAC) + + loop do + reader.primitive_convert(source, out) + _, _, _, error_bytes, _ = reader.primitive_errinfo + break if error_bytes.nil? + out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace) + end - # We can't transcode to the same format, so we choose a nearly-identical encoding. - # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to - # CP1252 when we get errors. The final string will be 'converted' back to UTF-8 - # before returning. - reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC) - - source = string.dup - out = ''.force_encoding(Encoding::UTF_8_MAC) + reader.finish - loop do - reader.primitive_convert(source, out) - _, _, _, error_bytes, _ = reader.primitive_errinfo - break if error_bytes.nil? - out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace) + out.encode!(Encoding::UTF_8) end - - reader.finish - - out.encode!(Encoding::UTF_8) end # Returns the KC normalization of the string by default. NFKC is @@ -371,14 +377,8 @@ module ActiveSupport end.pack('U*') end - def tidy_byte(byte) - if byte < 160 - [database.cp1252[byte] || byte].pack("U").unpack("C*") - elsif byte < 192 - [194, byte] - else - [195, byte - 64] - end + def recode_windows1252_chars(string) + string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace) end def database |