aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport
diff options
context:
space:
mode:
authorCliff Pruitt <cliff.pruitt@cliffpruitt.com>2019-07-26 11:02:40 -0400
committerCliff Pruitt <cliff.pruitt@cliffpruitt.com>2019-07-26 12:18:01 -0400
commit05633d02d8ac8aa1289c0a01872e13e9b2449cd5 (patch)
tree952cb2760ca0877ce37b9449f930850ef67b5469 /activesupport
parent369daa530dd9db7bbba79b8c75012e0fa84c9f48 (diff)
downloadrails-05633d02d8ac8aa1289c0a01872e13e9b2449cd5.tar.gz
rails-05633d02d8ac8aa1289c0a01872e13e9b2449cd5.tar.bz2
rails-05633d02d8ac8aa1289c0a01872e13e9b2449cd5.zip
Handle GB18030 strings with invalid characters in transliterate
GB18030 is Unicode compatible and covers all Unicode code points so we can temporarily convert GB18030 strings to UTF-8 to perform the transliteration. After transliterating we want to convert back to GB18030. In all cases of transcoding, we replace invalid or undefined characters with the default replacement character ("?"). This is in line with the behavior of tidy_bytes which is used on the UTF-8 string before transliterating.
Diffstat (limited to 'activesupport')
-rw-r--r--activesupport/lib/active_support/inflector/transliterate.rb20
-rw-r--r--activesupport/test/transliterate_test.rb11
2 files changed, 18 insertions, 13 deletions
diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb
index a6f57d73ac..0751f8a3ad 100644
--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
@@ -57,8 +57,8 @@ module ActiveSupport
# transliterate('Jürgen', locale: :de)
# # => "Juergen"
#
- # Transliteration of ASCII-8BIT / BINARY strings is not
- # supported and will raise an ArgumentError.
+ # Transliteration is restricted to UTF-8, US-ASCII and GB18030 strings.
+ # Other encodings will raise an ArgumentError.
def transliterate(string, replacement = "?", locale: nil)
raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)
@@ -67,18 +67,26 @@ module ActiveSupport
input_encoding = string.encoding
- # US-ASCII is a subset so we'll force encoding as UTF-8 if US-ASCII is given
- # This way we can hancle invalid bytes in the same way as we do for UTF-8
+ # US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if
+ # US-ASCII is given. This way we can let tidy_bytes handle the string
+ # in the same way as we do for UTF-8
string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII
+ # GB18030 is Unicode compatible but its bytes do not map directly onto
+ # UTF-8, so the string needs to be transcoded. Using invalid/undef :replace
+ # will lose data in the event of invalid characters, but since tidy_bytes
+ # will replace invalid/undef with a "?" we're safe to do the same beforehand.
+ string.encode!(Encoding::UTF_8, invalid: :replace, undef: :replace) if string.encoding == Encoding::GB18030
+
transliterated = I18n.transliterate(
ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc),
replacement: replacement,
locale: locale
)
- # If we were given US-ASCII we give back US-ASCII
- transliterated.force_encoding(Encoding::US_ASCII) if input_encoding == Encoding::US_ASCII
+ # Restore the string encoding of the input if it was not UTF-8.
+ # Apply invalid/undef :replace as tidy_bytes does.
+ transliterated.encode!(input_encoding, invalid: :replace, undef: :replace) if input_encoding != transliterated.encoding
transliterated
end
diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb
index 47830946bf..ab7ffcaed0 100644
--- a/activesupport/test/transliterate_test.rb
+++ b/activesupport/test/transliterate_test.rb
@@ -75,7 +75,9 @@ class TransliterateTest < ActiveSupport::TestCase
# Valid GB18030 Works
def test_transliterate_handles_strings_with_valid_gb18030_encodings
string = String.new("A", encoding: Encoding::GB18030)
- assert_equal "A", ActiveSupport::Inflector.transliterate(string)
+ transcoded = ActiveSupport::Inflector.transliterate(string)
+ assert_equal "A", transcoded
+ assert_equal Encoding::GB18030, transcoded.encoding
end
# All other encodings raise argument errors
@@ -103,17 +105,12 @@ class TransliterateTest < ActiveSupport::TestCase
# Invalid raises exception
def test_transliterate_handles_strings_with_invalid_us_ascii_bytes
string = String.new("\255", encoding: Encoding::US_ASCII)
- # exception = assert_raises Encoding::CompatibilityError do
- # ActiveSupport::Inflector.transliterate(string)
- # end
assert_equal "?", ActiveSupport::Inflector.transliterate(string)
end
# Invalid GB18030 raises exception
def test_transliterate_handles_strings_with_invalid_gb18030_bytes
string = String.new("\255", encoding: Encoding::GB18030)
- exception = assert_raises Encoding::CompatibilityError do
- ActiveSupport::Inflector.transliterate(string)
- end
+ assert_equal "?", ActiveSupport::Inflector.transliterate(string)
end
end