From 56e7b31487b838410d185eaf573359432ec2d11a Mon Sep 17 00:00:00 2001 From: Adam Roben Date: Wed, 13 Nov 2013 17:13:22 -0500 Subject: Refactor Unicode.unpack_graphemes slightly This will make it easier to add the rest of the rules listed in UAX 29. --- .../lib/active_support/multibyte/unicode.rb | 36 ++++++++++++++-------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'activesupport/lib/active_support') diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb index 1845c6ae38..aea7709b55 100644 --- a/activesupport/lib/active_support/multibyte/unicode.rb +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -89,19 +89,29 @@ module ActiveSupport pos += 1 previous = codepoints[pos-1] current = codepoints[pos] - if ( - # CR X LF - ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or - # L X (L|V|LV|LVT) - ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or - # (LV|V) X (V|T) - ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or - # (LVT|T) X (T) - ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or - # X Extend - (database.boundary[:extend] === current) - ) - else + + should_break = + # GB3. CR X LF + if previous == database.boundary[:cr] and current == database.boundary[:lf] + false + # GB6. L X (L|V|LV|LVT) + elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) + false + # GB7. (LV|V) X (V|T) + elsif in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) + false + # GB8. (LVT|T) X (T) + elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current + false + # GB9. X Extend + elsif database.boundary[:extend] === current + false + # GB10. Any ÷ Any + else + true + end + + if should_break unpacked << codepoints[marker..pos-1] marker = pos end -- cgit v1.2.3