diff options
author | Rafael França <rafael@franca.dev> | 2019-07-26 14:20:06 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-07-26 14:20:06 -0400 |
commit | 2c39978ffca683445c19351757f5cdb79af62575 (patch) | |
tree | 4642c3d05a1427bc8701673df3985bcc954c49d5 /activesupport | |
parent | ea4305109e6dd2833e6e81fe8e70e0b6951a3c9d (diff) | |
parent | a7b6a9553bfb0a84546b0b74c71efc0688881127 (diff) | |
download | rails-2c39978ffca683445c19351757f5cdb79af62575.tar.gz rails-2c39978ffca683445c19351757f5cdb79af62575.tar.bz2 rails-2c39978ffca683445c19351757f5cdb79af62575.zip |
Merge pull request #36702 from cpruitt/raise-on-transliterate-ascii-8bit
Handle invalid string encodings and characters in ActiveSupport::Inflector.transliterate
Diffstat (limited to 'activesupport')
-rw-r--r-- | activesupport/lib/active_support/inflector/transliterate.rb | 27 | ||||
-rw-r--r-- | activesupport/test/transliterate_test.rb | 49 |
2 files changed, 75 insertions, 1 deletions
```diff
diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb
index ec6e9ccb59..0751f8a3ad 100644
--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
@@ -56,14 +56,39 @@ module ActiveSupport
     #
     #   transliterate('Jürgen', locale: :de)
     #   # => "Juergen"
+    #
+    # Transliteration is restricted to UTF-8, US-ASCII and GB18030 strings
+    # Other encodings will raise an ArgumentError.
     def transliterate(string, replacement = "?", locale: nil)
       raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)
 
-      I18n.transliterate(
+      allowed_encodings = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030]
+      raise ArgumentError, "Can not transliterate strings with #{string.encoding} encoding" unless allowed_encodings.include?(string.encoding)
+
+      input_encoding = string.encoding
+
+      # US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if
+      # US-ASCII is given. This way we can let tidy_bytes handle the string
+      # in the same way as we do for UTF-8
+      string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII
+
+      # GB18030 is Unicode compatible but is not a direct mapping so needs to be
+      # transcoded. Using invalid/undef :replace will result in loss of data in
+      # the event of invalid characters, but since tidy_bytes will replace
+      # invalid/undef with a "?" we're safe to do the same beforehand
+      string.encode!(Encoding::UTF_8, invalid: :replace, undef: :replace) if string.encoding == Encoding::GB18030
+
+      transliterated = I18n.transliterate(
         ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc),
         replacement: replacement,
         locale: locale
       )
+
+      # Restore the string encoding of the input if it was not UTF-8.
+      # Apply invalid/undef :replace as tidy_bytes does
+      transliterated.encode!(input_encoding, invalid: :replace, undef: :replace) if input_encoding != transliterated.encoding
+
+      transliterated
     end
 
     # Replaces special characters in a string so that it may be used as part of
diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb
index 9e29a93ea0..2e02b5e938 100644
--- a/activesupport/test/transliterate_test.rb
+++ b/activesupport/test/transliterate_test.rb
@@ -57,4 +57,53 @@ class TransliterateTest < ActiveSupport::TestCase
     end
     assert_equal "Can only transliterate strings. Received Object", exception.message
   end
+
+  def test_transliterate_handles_strings_with_valid_utf8_encodings
+    string = String.new("A", encoding: Encoding::UTF_8)
+    assert_equal "A", ActiveSupport::Inflector.transliterate(string)
+  end
+
+  def test_transliterate_handles_strings_with_valid_us_ascii_encodings
+    string = String.new("A", encoding: Encoding::US_ASCII)
+    transcoded = ActiveSupport::Inflector.transliterate(string)
+    assert_equal "A", transcoded
+    assert_equal Encoding::US_ASCII, transcoded.encoding
+  end
+
+  def test_transliterate_handles_strings_with_valid_gb18030_encodings
+    string = String.new("A", encoding: Encoding::GB18030)
+    transcoded = ActiveSupport::Inflector.transliterate(string)
+    assert_equal "A", transcoded
+    assert_equal Encoding::GB18030, transcoded.encoding
+  end
+
+  def test_transliterate_handles_strings_with_incompatible_encodings
+    incompatible_encodings = Encoding.list - [
+      Encoding::UTF_8,
+      Encoding::US_ASCII,
+      Encoding::GB18030
+    ]
+    incompatible_encodings.each do |encoding|
+      string = String.new("", encoding: encoding)
+      exception = assert_raises ArgumentError do
+        ActiveSupport::Inflector.transliterate(string)
+      end
+      assert_equal "Can not transliterate strings with #{encoding} encoding", exception.message
+    end
+  end
+
+  def test_transliterate_handles_strings_with_invalid_utf8_bytes
+    string = String.new("\255", encoding: Encoding::UTF_8)
+    assert_equal "?", ActiveSupport::Inflector.transliterate(string)
+  end
+
+  def test_transliterate_handles_strings_with_invalid_us_ascii_bytes
+    string = String.new("\255", encoding: Encoding::US_ASCII)
+    assert_equal "?", ActiveSupport::Inflector.transliterate(string)
+  end
+
+  def test_transliterate_handles_strings_with_invalid_gb18030_bytes
+    string = String.new("\255", encoding: Encoding::GB18030)
+    assert_equal "?", ActiveSupport::Inflector.transliterate(string)
+  end
 end
```