aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--activesupport/lib/active_support/inflector/transliterate.rb20
-rw-r--r--activesupport/test/transliterate_test.rb11
2 files changed, 18 insertions, 13 deletions
diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb
index a6f57d73ac..0751f8a3ad 100644
--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
@@ -57,8 +57,8 @@ module ActiveSupport
# transliterate('Jürgen', locale: :de)
# # => "Juergen"
#
- # Transliteration of ASCII-8BIT / BINARY strings is not
- # supported and will raise an ArgumentError.
+ # Transliteration is restricted to UTF-8, US-ASCII and GB18030 strings.
+ # Other encodings will raise an ArgumentError.
def transliterate(string, replacement = "?", locale: nil)
raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)
@@ -67,18 +67,26 @@ module ActiveSupport
input_encoding = string.encoding
- # US-ASCII is a subset so we'll force encoding as UTF-8 if US-ASCII is given
- # This way we can hancle invalid bytes in the same way as we do for UTF-8
+ # US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if
+ # US-ASCII is given. This way we can let tidy_bytes handle the string
+ # in the same way as we do for UTF-8.
string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII
+ # GB18030 is Unicode compatible but is not a direct mapping so needs to be
+ # transcoded. Using invalid/undef :replace will result in loss of data in
+ # the event of invalid characters, but since tidy_bytes will replace
+ # invalid/undef with a "?" we're safe to do the same beforehand
+ string.encode!(Encoding::UTF_8, invalid: :replace, undef: :replace) if string.encoding == Encoding::GB18030
+
transliterated = I18n.transliterate(
ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc),
replacement: replacement,
locale: locale
)
- # If we were given US-ASCII we give back US-ASCII
- transliterated.force_encoding(Encoding::US_ASCII) if input_encoding == Encoding::US_ASCII
+ # Restore the string encoding of the input if it was not UTF-8.
+ # Apply invalid/undef :replace as tidy_bytes does
+ transliterated.encode!(input_encoding, invalid: :replace, undef: :replace) if input_encoding != transliterated.encoding
transliterated
end
diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb
index 47830946bf..ab7ffcaed0 100644
--- a/activesupport/test/transliterate_test.rb
+++ b/activesupport/test/transliterate_test.rb
@@ -75,7 +75,9 @@ class TransliterateTest < ActiveSupport::TestCase
# Valid GB18030 Works
def test_transliterate_handles_strings_with_valid_gb18030_encodings
string = String.new("A", encoding: Encoding::GB18030)
- assert_equal "A", ActiveSupport::Inflector.transliterate(string)
+ transcoded = ActiveSupport::Inflector.transliterate(string)
+ assert_equal "A", transcoded
+ assert_equal Encoding::GB18030, transcoded.encoding
end
# All other encodings raise argument errors
@@ -103,17 +105,12 @@ class TransliterateTest < ActiveSupport::TestCase
# Invalid US-ASCII bytes are transliterated to the replacement ("?")
def test_transliterate_handles_strings_with_invalid_us_ascii_bytes
string = String.new("\255", encoding: Encoding::US_ASCII)
- # exception = assert_raises Encoding::CompatibilityError do
- # ActiveSupport::Inflector.transliterate(string)
- # end
assert_equal "?", ActiveSupport::Inflector.transliterate(string)
end
# Invalid GB18030 bytes are transliterated to the replacement ("?")
def test_transliterate_handles_strings_with_invalid_gb18030_bytes
string = String.new("\255", encoding: Encoding::GB18030)
- exception = assert_raises Encoding::CompatibilityError do
- ActiveSupport::Inflector.transliterate(string)
- end
+ assert_equal "?", ActiveSupport::Inflector.transliterate(string)
end
end