From 4940cc49ddb361d584d51bc3eb4675ff8ece4a2b Mon Sep 17 00:00:00 2001 From: Jeremy Daer Date: Fri, 9 Dec 2016 11:34:35 -0700 Subject: String#truncate_bytes: limit to N bytes without breaking multibyte chars This faithfully preserves grapheme clusters (characters composed of other characters and combining marks) and other multibyte characters. --- activesupport/CHANGELOG.md | 5 ++ .../lib/active_support/core_ext/string/filters.rb | 41 ++++++++++++++ activesupport/test/core_ext/string_ext_test.rb | 62 ++++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/activesupport/CHANGELOG.md b/activesupport/CHANGELOG.md index a844d1d8d7..66b7365916 100644 --- a/activesupport/CHANGELOG.md +++ b/activesupport/CHANGELOG.md @@ -1,5 +1,10 @@ ## Rails 6.0.0.alpha (Unreleased) ## +* `String#truncate_bytes` to truncate a string to a maximum bytesize without + breaking multibyte characters or grapheme clusters like 👩‍👩‍👦‍👦. + + *Jeremy Daer* + * `String#strip_heredoc` preserves frozenness. "foo".freeze.strip_heredoc.frozen? # => true diff --git a/activesupport/lib/active_support/core_ext/string/filters.rb b/activesupport/lib/active_support/core_ext/string/filters.rb index 66e721eea3..df0e79afa8 100644 --- a/activesupport/lib/active_support/core_ext/string/filters.rb +++ b/activesupport/lib/active_support/core_ext/string/filters.rb @@ -78,6 +78,47 @@ class String "#{self[0, stop]}#{omission}" end + # Truncates +text+ to at most bytesize bytes in length without + # breaking string encoding by splitting multibyte characters or breaking + # grapheme clusters ("perceptual characters") by truncating at combining + # characters. + # + # >> "🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪".size + # => 20 + # >> "🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪".bytesize + # => 80 + # >> "🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪🔪".truncate_bytes(20) + # => "🔪🔪🔪🔪…" + # + # The truncated text ends with the :omission string, defaulting + # to "…", for a total length not exceeding bytesize. + def truncate_bytes(truncate_at, omission: "…") + omission ||= "" + + case + when bytesize <= truncate_at + dup + when omission.bytesize > truncate_at + raise ArgumentError, "Omission #{omission.inspect} is #{omission.bytesize}, larger than the truncation length of #{truncate_at} bytes" + when omission.bytesize == truncate_at + omission.dup + else + self.class.new.tap do |cut| + cut_at = truncate_at - omission.bytesize + + scan(/\X/) do |grapheme| + if cut.bytesize + grapheme.bytesize <= cut_at + cut << grapheme + else + break + end + end + + cut << omission + end + end + end + # Truncates a given +text+ after a given number of words (words_count): # # 'Once upon a time in a world far far away'.truncate_words(4) diff --git a/activesupport/test/core_ext/string_ext_test.rb b/activesupport/test/core_ext/string_ext_test.rb index b95d2d8307..ccaa5dc786 100644 --- a/activesupport/test/core_ext/string_ext_test.rb +++ b/activesupport/test/core_ext/string_ext_test.rb @@ -285,6 +285,68 @@ class StringInflectionsTest < ActiveSupport::TestCase assert_equal "Hello Big[...]", "Hello Big World!".truncate(15, omission: "[...]", separator: /\s/) end + def test_truncate_bytes + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16) + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16, omission: nil) + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16, omission: " ") + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16, omission: "🖖") + + assert_equal "👍👍👍…", "👍👍👍👍".truncate_bytes(15) + assert_equal "👍👍👍", "👍👍👍👍".truncate_bytes(15, omission: nil) + assert_equal "👍👍👍 ", "👍👍👍👍".truncate_bytes(15, omission: " ") + assert_equal "👍👍🖖", "👍👍👍👍".truncate_bytes(15, omission: "🖖") + + assert_equal "…", "👍👍👍👍".truncate_bytes(5) + assert_equal "👍", "👍👍👍👍".truncate_bytes(5, omission: nil) + assert_equal "👍 ", "👍👍👍👍".truncate_bytes(5, omission: " ") + assert_equal "🖖", "👍👍👍👍".truncate_bytes(5, omission: "🖖") + + assert_equal "…", "👍👍👍👍".truncate_bytes(4) + assert_equal "👍", "👍👍👍👍".truncate_bytes(4, omission: nil) + assert_equal " ", "👍👍👍👍".truncate_bytes(4, omission: " ") + assert_equal "🖖", "👍👍👍👍".truncate_bytes(4, omission: "🖖") + + assert_raise ArgumentError do + "👍👍👍👍".truncate_bytes(3, omission: "🖖") + end + end + + def test_truncate_bytes_preserves_codepoints + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16) + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16, omission: nil) + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16, omission: " ") + assert_equal "👍👍👍👍", "👍👍👍👍".truncate_bytes(16, omission: "🖖") + + assert_equal "👍👍👍…", "👍👍👍👍".truncate_bytes(15) + assert_equal "👍👍👍", "👍👍👍👍".truncate_bytes(15, omission: nil) + assert_equal "👍👍👍 ", "👍👍👍👍".truncate_bytes(15, omission: " ") + assert_equal "👍👍🖖", "👍👍👍👍".truncate_bytes(15, omission: "🖖") + + assert_equal "…", "👍👍👍👍".truncate_bytes(5) + assert_equal "👍", "👍👍👍👍".truncate_bytes(5, omission: nil) + assert_equal "👍 ", "👍👍👍👍".truncate_bytes(5, omission: " ") + assert_equal "🖖", "👍👍👍👍".truncate_bytes(5, omission: "🖖") + + assert_equal "…", "👍👍👍👍".truncate_bytes(4) + assert_equal "👍", "👍👍👍👍".truncate_bytes(4, omission: nil) + assert_equal " ", "👍👍👍👍".truncate_bytes(4, omission: " ") + assert_equal "🖖", "👍👍👍👍".truncate_bytes(4, omission: "🖖") + + assert_raise ArgumentError do + "👍👍👍👍".truncate_bytes(3, omission: "🖖") + end + end + + def test_truncates_bytes_preserves_grapheme_clusters + assert_equal "a ", "a ❤️ b".truncate_bytes(2, omission: nil) + assert_equal "a ", "a ❤️ b".truncate_bytes(3, omission: nil) + assert_equal "a ", "a ❤️ b".truncate_bytes(7, omission: nil) + assert_equal "a ❤️", "a ❤️ b".truncate_bytes(8, omission: nil) + + assert_equal "a ", "a 👩‍❤️‍👩".truncate_bytes(13, omission: nil) + assert_equal "", "👩‍❤️‍👩".truncate_bytes(13, omission: nil) + end + def test_truncate_words assert_equal "Hello Big World!", "Hello Big World!".truncate_words(3) assert_equal "Hello Big...", "Hello Big World!".truncate_words(2) -- cgit v1.2.3