From 5b0a805c6a9f22818c17ddd5641b9480ea1d668d Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Wed, 17 Jul 2019 14:12:41 -0400 Subject: Raise errors for ASCII-8BIT encoding in ActiveSupport::Inflector::transliterate Raises an ArgumentError from `ActiveSupport::Inflector::transliterate` if the string is encoded as ASCII-8BIT, which would otherwise raise an error in `unicode_normalize`. --- activesupport/lib/active_support/inflector/transliterate.rb | 4 ++++ activesupport/test/transliterate_test.rb | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'activesupport') diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index ec6e9ccb59..61651ba101 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -56,8 +56,12 @@ module ActiveSupport # # transliterate('Jürgen', locale: :de) # # => "Juergen" + # + # Transliteration of ASCII-8BIT / BINARY strings is not + # supported and will raise an ArgumentError. def transliterate(string, replacement = "?", locale: nil) raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String) + raise ArgumentError, "Can not transliterate strings with ASCII-8BIT encoding" if string.encoding == ::Encoding::ASCII_8BIT I18n.transliterate( ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc), diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index 9e29a93ea0..620bb20305 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -57,4 +57,12 @@ class TransliterateTest < ActiveSupport::TestCase end assert_equal "Can only transliterate strings. 
Received Object", exception.message end + + def test_transliterate_handles_ascci_8bit_strings + ascii_8bit_string = "A".b + exception = assert_raises ArgumentError do + ActiveSupport::Inflector.transliterate(ascii_8bit_string) + end + assert_equal "Can not transliterate strings with ASCII-8BIT encoding", exception.message + end end -- cgit v1.2.3 From 0cdaa38dd190cae1f13f0f22ab3d1002b6a82081 Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Fri, 19 Jul 2019 15:16:23 -0400 Subject: Add encoding tests for ActiveSupport::Inflector.transliterate --- activesupport/test/transliterate_test.rb | 57 ++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'activesupport') diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index 620bb20305..4101e4878a 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -58,6 +58,63 @@ class TransliterateTest < ActiveSupport::TestCase assert_equal "Can only transliterate strings. 
Received Object", exception.message end + # Valid UTF-8 Works + def test_transliterate_handles_strings_with_valid_utf8_encodings + string = String.new("A", encoding: Encoding::UTF_8) + assert_equal "A", ActiveSupport::Inflector.transliterate(string) + end + + # Valid US-ASCII Works + def test_transliterate_handles_strings_with_valid_us_ascii_encodings + string = String.new("A", encoding: Encoding::US_ASCII) + assert_equal "A", ActiveSupport::Inflector.transliterate(string) + end + + # Valid GB18030 Works + def test_transliterate_handles_strings_with_valid_gb18030_encodings + string = String.new("A", encoding: Encoding::GB18030) + assert_equal "A", ActiveSupport::Inflector.transliterate(string) + end + + # All other encodings raise exceptions + def test_transliterate_handles_strings_with_incompatible_encodings + incompatible_encodings = Encoding.list - [ + Encoding::UTF_8, + Encoding::US_ASCII, + Encoding::GB18030, + ] + # This Raises an argument error + incompatible_encodings -= [Encoding::ASCII_8BIT] + incompatible_encodings.each do |encoding| + string = String.new("", encoding: encoding) + exception = assert_raises Encoding::CompatibilityError do + ActiveSupport::Inflector.transliterate(string) + end + end + end + + # Invalid UTF-8 Works + def test_transliterate_handles_strings_with_invalid_utf8_bytes + string = String.new("\255", encoding: Encoding::UTF_8) + assert_equal "?", ActiveSupport::Inflector.transliterate(string) + end + + # Invalid raises exception + def test_transliterate_handles_strings_with_invalid_us_ascii_bytes + string = String.new("\255", encoding: Encoding::US_ASCII) + exception = assert_raises Encoding::CompatibilityError do + ActiveSupport::Inflector.transliterate(string) + end + end + + # Invalid GB18030 raises exception + def test_transliterate_handles_strings_with_invalid_gb18030_bytes + string = String.new("\255", encoding: Encoding::GB18030) + exception = assert_raises Encoding::CompatibilityError do + 
ActiveSupport::Inflector.transliterate(string) + end + end + def test_transliterate_handles_ascci_8bit_strings ascii_8bit_string = "A".b exception = assert_raises ArgumentError do -- cgit v1.2.3 From 369daa530dd9db7bbba79b8c75012e0fa84c9f48 Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Thu, 25 Jul 2019 14:51:42 -0400 Subject: Handle US-ASCII strings with invalid characters in transliterate US-ASCII is a subset of UTF-8 so we can temporarily convert US-ASCII strings to UTF-8 to perform the transliteration. After we've converted characters to ASCII representations, we can set the encoding back to US-ASCII to return the same encoding we accepted. --- .../lib/active_support/inflector/transliterate.rb | 17 +++++++++++-- activesupport/test/transliterate_test.rb | 28 +++++++++------------- 2 files changed, 26 insertions(+), 19 deletions(-) (limited to 'activesupport') diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index 61651ba101..a6f57d73ac 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -61,13 +61,26 @@ module ActiveSupport # supported and will raise an ArgumentError. def transliterate(string, replacement = "?", locale: nil) raise ArgumentError, "Can only transliterate strings. 
Received #{string.class.name}" unless string.is_a?(String) - raise ArgumentError, "Can not transliterate strings with ASCII-8BIT encoding" if string.encoding == ::Encoding::ASCII_8BIT - I18n.transliterate( + allowed_encodings = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030] + raise ArgumentError, "Can not transliterate strings with #{string.encoding} encoding" unless allowed_encodings.include?(string.encoding) + + input_encoding = string.encoding + + # US-ASCII is a subset so we'll force encoding as UTF-8 if US-ASCII is given + # This way we can hancle invalid bytes in the same way as we do for UTF-8 + string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII + + transliterated = I18n.transliterate( ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc), replacement: replacement, locale: locale ) + + # If we were given US-ASCII we give back US-ASCII + transliterated.force_encoding(Encoding::US_ASCII) if input_encoding == Encoding::US_ASCII + + transliterated end # Replaces special characters in a string so that it may be used as part of diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index 4101e4878a..47830946bf 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -67,7 +67,9 @@ class TransliterateTest < ActiveSupport::TestCase # Valid US-ASCII Works def test_transliterate_handles_strings_with_valid_us_ascii_encodings string = String.new("A", encoding: Encoding::US_ASCII) - assert_equal "A", ActiveSupport::Inflector.transliterate(string) + transcoded = ActiveSupport::Inflector.transliterate(string) + assert_equal "A", transcoded + assert_equal Encoding::US_ASCII, transcoded.encoding end # Valid GB18030 Works @@ -76,20 +78,19 @@ class TransliterateTest < ActiveSupport::TestCase assert_equal "A", ActiveSupport::Inflector.transliterate(string) end - # All other encodings raise exceptions + # All other encodings raise 
argument errors def test_transliterate_handles_strings_with_incompatible_encodings incompatible_encodings = Encoding.list - [ Encoding::UTF_8, Encoding::US_ASCII, - Encoding::GB18030, + Encoding::GB18030 ] - # This Raises an argument error - incompatible_encodings -= [Encoding::ASCII_8BIT] incompatible_encodings.each do |encoding| string = String.new("", encoding: encoding) - exception = assert_raises Encoding::CompatibilityError do + exception = assert_raises ArgumentError do ActiveSupport::Inflector.transliterate(string) end + assert_equal "Can not transliterate strings with #{encoding} encoding", exception.message end end @@ -102,9 +103,10 @@ class TransliterateTest < ActiveSupport::TestCase # Invalid raises exception def test_transliterate_handles_strings_with_invalid_us_ascii_bytes string = String.new("\255", encoding: Encoding::US_ASCII) - exception = assert_raises Encoding::CompatibilityError do - ActiveSupport::Inflector.transliterate(string) - end + # exception = assert_raises Encoding::CompatibilityError do + # ActiveSupport::Inflector.transliterate(string) + # end + assert_equal "?", ActiveSupport::Inflector.transliterate(string) end # Invalid GB18030 raises exception @@ -114,12 +116,4 @@ class TransliterateTest < ActiveSupport::TestCase ActiveSupport::Inflector.transliterate(string) end end - - def test_transliterate_handles_ascci_8bit_strings - ascii_8bit_string = "A".b - exception = assert_raises ArgumentError do - ActiveSupport::Inflector.transliterate(ascii_8bit_string) - end - assert_equal "Can not transliterate strings with ASCII-8BIT encoding", exception.message - end end -- cgit v1.2.3 From 05633d02d8ac8aa1289c0a01872e13e9b2449cd5 Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Fri, 26 Jul 2019 11:02:40 -0400 Subject: Handle GB18030 strings with invalid characters in transliterate GB18030 is Unicode compatible and covers all Unicode code points so we can temporarily convert GB18030 strings to UTF-8 to perform the transliteration. 
After transliterating we want to convert back to GB18030. In all cases of transcoding, we replace invalid or undefined characters with the default replacement character ("?"). This is in line with the behavior of tidy_bytes which is used on the UTF-8 string before transliterating. --- .../lib/active_support/inflector/transliterate.rb | 20 ++++++++++++++------ activesupport/test/transliterate_test.rb | 11 ++++------- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'activesupport') diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index a6f57d73ac..0751f8a3ad 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -57,8 +57,8 @@ module ActiveSupport # transliterate('Jürgen', locale: :de) # # => "Juergen" # - # Transliteration of ASCII-8BIT / BINARY strings is not - # supported and will raise an ArgumentError. + # Transliteration is restricted to UTF-8, US-ASCII and GB18030 strings + # Other encodings will raise an ArgumentError. def transliterate(string, replacement = "?", locale: nil) raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String) @@ -67,18 +67,26 @@ module ActiveSupport input_encoding = string.encoding - # US-ASCII is a subset so we'll force encoding as UTF-8 if US-ASCII is given - # This way we can hancle invalid bytes in the same way as we do for UTF-8 + # US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if + # US-ASCII is given. This way we can let tidy_bytes handle the string + # in the same way as we do for UTF-8 string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII + # GB18030 is Unicode compatible but is not a direct mapping so needs to be + # transcoded. 
Using invalid/undef :replace will result in loss of data in + # the event of invalid characters, but since tidy_bytes will replace + # invalid/undef with a "?" we're safe to do the same beforehand + string.encode!(Encoding::UTF_8, invalid: :replace, undef: :replace) if string.encoding == Encoding::GB18030 + transliterated = I18n.transliterate( ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc), replacement: replacement, locale: locale ) - # If we were given US-ASCII we give back US-ASCII - transliterated.force_encoding(Encoding::US_ASCII) if input_encoding == Encoding::US_ASCII + # Restore the string encoding of the input if it was not UTF-8. + # Apply invalid/undef :replace as tidy_bytes does + transliterated.encode!(input_encoding, invalid: :replace, undef: :replace) if input_encoding != transliterated.encoding transliterated end diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index 47830946bf..ab7ffcaed0 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -75,7 +75,9 @@ class TransliterateTest < ActiveSupport::TestCase # Valid GB18030 Works def test_transliterate_handles_strings_with_valid_gb18030_encodings string = String.new("A", encoding: Encoding::GB18030) - assert_equal "A", ActiveSupport::Inflector.transliterate(string) + transcoded = ActiveSupport::Inflector.transliterate(string) + assert_equal "A", transcoded + assert_equal Encoding::GB18030, transcoded.encoding end # All other encodings raise argument errors @@ -103,17 +105,12 @@ class TransliterateTest < ActiveSupport::TestCase # Invalid raises exception def test_transliterate_handles_strings_with_invalid_us_ascii_bytes string = String.new("\255", encoding: Encoding::US_ASCII) - # exception = assert_raises Encoding::CompatibilityError do - # ActiveSupport::Inflector.transliterate(string) - # end assert_equal "?", ActiveSupport::Inflector.transliterate(string) end # Invalid 
GB18030 raises exception def test_transliterate_handles_strings_with_invalid_gb18030_bytes string = String.new("\255", encoding: Encoding::GB18030) - exception = assert_raises Encoding::CompatibilityError do - ActiveSupport::Inflector.transliterate(string) - end + assert_equal "?", ActiveSupport::Inflector.transliterate(string) end end -- cgit v1.2.3 From a7b6a9553bfb0a84546b0b74c71efc0688881127 Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Fri, 26 Jul 2019 12:42:06 -0400 Subject: Remove comments in test file --- activesupport/test/transliterate_test.rb | 7 ------- 1 file changed, 7 deletions(-) (limited to 'activesupport') diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index ab7ffcaed0..2e02b5e938 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -58,13 +58,11 @@ class TransliterateTest < ActiveSupport::TestCase assert_equal "Can only transliterate strings. Received Object", exception.message end - # Valid UTF-8 Works def test_transliterate_handles_strings_with_valid_utf8_encodings string = String.new("A", encoding: Encoding::UTF_8) assert_equal "A", ActiveSupport::Inflector.transliterate(string) end - # Valid US-ASCII Works def test_transliterate_handles_strings_with_valid_us_ascii_encodings string = String.new("A", encoding: Encoding::US_ASCII) transcoded = ActiveSupport::Inflector.transliterate(string) @@ -72,7 +70,6 @@ class TransliterateTest < ActiveSupport::TestCase assert_equal Encoding::US_ASCII, transcoded.encoding end - # Valid GB18030 Works def test_transliterate_handles_strings_with_valid_gb18030_encodings string = String.new("A", encoding: Encoding::GB18030) transcoded = ActiveSupport::Inflector.transliterate(string) @@ -80,7 +77,6 @@ class TransliterateTest < ActiveSupport::TestCase assert_equal Encoding::GB18030, transcoded.encoding end - # All other encodings raise argument errors def 
test_transliterate_handles_strings_with_incompatible_encodings incompatible_encodings = Encoding.list - [ Encoding::UTF_8, @@ -96,19 +92,16 @@ class TransliterateTest < ActiveSupport::TestCase end end - # Invalid UTF-8 Works def test_transliterate_handles_strings_with_invalid_utf8_bytes string = String.new("\255", encoding: Encoding::UTF_8) assert_equal "?", ActiveSupport::Inflector.transliterate(string) end - # Invalid raises exception def test_transliterate_handles_strings_with_invalid_us_ascii_bytes string = String.new("\255", encoding: Encoding::US_ASCII) assert_equal "?", ActiveSupport::Inflector.transliterate(string) end - # Invalid GB18030 raises exception def test_transliterate_handles_strings_with_invalid_gb18030_bytes string = String.new("\255", encoding: Encoding::GB18030) assert_equal "?", ActiveSupport::Inflector.transliterate(string) -- cgit v1.2.3