From 369daa530dd9db7bbba79b8c75012e0fa84c9f48 Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Thu, 25 Jul 2019 14:51:42 -0400 Subject: Handle US-ASCII strings with invalid characters in transliterate US-ASCII is a subset of UTF-8 so we can temporarily convert US-ASCII strings to UTF-8 to perform the transliteration. After we've converted characters to ASCII representations, we can set the encoding back to US-ASCII to return the same encoding we accepted. --- .../lib/active_support/inflector/transliterate.rb | 17 +++++++++++-- activesupport/test/transliterate_test.rb | 28 +++++++++------------- 2 files changed, 26 insertions(+), 19 deletions(-) (limited to 'activesupport') diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index 61651ba101..a6f57d73ac 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -61,13 +61,26 @@ module ActiveSupport # supported and will raise an ArgumentError. def transliterate(string, replacement = "?", locale: nil) raise ArgumentError, "Can only transliterate strings. 
Received #{string.class.name}" unless string.is_a?(String) - raise ArgumentError, "Can not transliterate strings with ASCII-8BIT encoding" if string.encoding == ::Encoding::ASCII_8BIT - I18n.transliterate( + allowed_encodings = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030] + raise ArgumentError, "Can not transliterate strings with #{string.encoding} encoding" unless allowed_encodings.include?(string.encoding) + + input_encoding = string.encoding + + # US-ASCII is a subset so we'll force encoding as UTF-8 if US-ASCII is given + # This way we can handle invalid bytes in the same way as we do for UTF-8 + string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII + + transliterated = I18n.transliterate( ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc), replacement: replacement, locale: locale ) + + # If we were given US-ASCII we give back US-ASCII + transliterated.force_encoding(Encoding::US_ASCII) if input_encoding == Encoding::US_ASCII + + transliterated end # Replaces special characters in a string so that it may be used as part of diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index 4101e4878a..47830946bf 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -67,7 +67,9 @@ class TransliterateTest < ActiveSupport::TestCase # Valid US-ASCII Works def test_transliterate_handles_strings_with_valid_us_ascii_encodings string = String.new("A", encoding: Encoding::US_ASCII) - assert_equal "A", ActiveSupport::Inflector.transliterate(string) + transcoded = ActiveSupport::Inflector.transliterate(string) + assert_equal "A", transcoded + assert_equal Encoding::US_ASCII, transcoded.encoding end # Valid GB18030 Works @@ -76,20 +78,19 @@ class TransliterateTest < ActiveSupport::TestCase assert_equal "A", ActiveSupport::Inflector.transliterate(string) end - # All other encodings raise exceptions + # All other encodings raise 
argument errors def test_transliterate_handles_strings_with_incompatible_encodings incompatible_encodings = Encoding.list - [ Encoding::UTF_8, Encoding::US_ASCII, - Encoding::GB18030, + Encoding::GB18030 ] - # This Raises an argument error - incompatible_encodings -= [Encoding::ASCII_8BIT] incompatible_encodings.each do |encoding| string = String.new("", encoding: encoding) - exception = assert_raises Encoding::CompatibilityError do + exception = assert_raises ArgumentError do ActiveSupport::Inflector.transliterate(string) end + assert_equal "Can not transliterate strings with #{encoding} encoding", exception.message end end @@ -102,9 +103,10 @@ class TransliterateTest < ActiveSupport::TestCase # Invalid raises exception def test_transliterate_handles_strings_with_invalid_us_ascii_bytes string = String.new("\255", encoding: Encoding::US_ASCII) - exception = assert_raises Encoding::CompatibilityError do - ActiveSupport::Inflector.transliterate(string) - end + # exception = assert_raises Encoding::CompatibilityError do + # ActiveSupport::Inflector.transliterate(string) + # end + assert_equal "?", ActiveSupport::Inflector.transliterate(string) end # Invalid GB18030 raises exception @@ -114,12 +116,4 @@ class TransliterateTest < ActiveSupport::TestCase ActiveSupport::Inflector.transliterate(string) end end - - def test_transliterate_handles_ascci_8bit_strings - ascii_8bit_string = "A".b - exception = assert_raises ArgumentError do - ActiveSupport::Inflector.transliterate(ascii_8bit_string) - end - assert_equal "Can not transliterate strings with ASCII-8BIT encoding", exception.message - end end -- cgit v1.2.3