Move action_controller/vendor/html-scanner to action_view

This is another step in moving Action View's dependencies in Action Pack to Action View itself. Also, HtmlScanner seems to be better suited for views rather than controllers.
author: Piotr Sarnacki <drogus@gmail.com> 2012-06-09 14:40:50 +0200
committer: Piotr Sarnacki <drogus@gmail.com> 2012-08-28 10:51:03 +0200
commit: ba83aa7f03d2742dd242a45229e0f67785871515 (patch)
tree: ff35607be2e123302ba0db65f5c644d577c05dae /actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
parent: 4efad291c11a7c4fcf178fbb887b4845eaf61757 (diff)
download: rails-ba83aa7f03d2742dd242a45229e0f67785871515.tar.gz
rails-ba83aa7f03d2742dd242a45229e0f67785871515.tar.bz2
rails-ba83aa7f03d2742dd242a45229e0f67785871515.zip
1 files changed, 107 insertions, 0 deletions
diff --git a/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb b/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
new file mode 100644
index 0000000000..8ac8d34430
--- /dev/null
+++ b/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
@@ -0,0 +1,107 @@
+require 'strscan'
+
+module HTML #:nodoc:
+
+  # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+  # token is a string. Each string represents either "text", or an HTML element.
+  #
+  # This currently assumes valid XHTML, which means no free < or > characters.
+  #
+  # Usage:
+  #
+  #   tokenizer = HTML::Tokenizer.new(text)
+  #   while token = tokenizer.next
+  #     p token
+  #   end
+  class Tokenizer #:nodoc:
+
+    # The current (byte) position in the text
+    attr_reader :position
+
+    # The current line number
+    attr_reader :line
+
+    # Create a new Tokenizer for the given text.
+    def initialize(text)
+      text.encode!
+      @scanner = StringScanner.new(text)
+      @position = 0
+      @line = 0
+      @current_line = 1
+    end
+
+    # Return the next token in the sequence, or +nil+ if there are no more tokens in
+    # the stream.
+    def next
+      return nil if @scanner.eos?
+      @position = @scanner.pos
+      @line = @current_line
+      if @scanner.check(/<\S/)
+        update_current_line(scan_tag)
+      else
+        update_current_line(scan_text)
+      end
+    end
+
+    private
+
+      # Treat the text at the current position as a tag, and scan it. Supports
+      # comments, doctype tags, and regular tags, and ignores less-than and
+      # greater-than characters within quoted strings.
+      def scan_tag
+        tag = @scanner.getch
+        if @scanner.scan(/!--/) # comment
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!\[CDATA\[/)
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!/) # doctype
+          tag << @scanner.matched
+          tag << consume_quoted_regions
+        else
+          tag << consume_quoted_regions
+        end
+        tag
+      end
+
+      # Scan all text up to the next < character and return it.
+      def scan_text
+        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+      end
+
+      # Counts the number of newlines in the text and updates the current line
+      # accordingly.
+      def update_current_line(text)
+        text.scan(/\r?\n/) { @current_line += 1 }
+      end
+
+      # Skips over quoted strings, so that less-than and greater-than characters
+      # within the strings are ignored.
+      def consume_quoted_regions
+        text = ""
+        loop do
+          match = @scanner.scan_until(/['"<>]/) or break
+
+          delim = @scanner.matched
+          if delim == "<"
+            match = match.chop
+            @scanner.pos -= 1
+          end
+
+          text << match
+          break if delim == "<" || delim == ">"
+
+          # consume the quoted region
+          while match = @scanner.scan_until(/[\\#{delim}]/)
+            text << match
+            break if @scanner.matched == delim
+            break if @scanner.eos?
+            text << @scanner.getch # skip the escaped character
+          end
+        end
+        text
+      end
+  end
+
+end
author	Piotr Sarnacki <drogus@gmail.com>	2012-06-09 14:40:50 +0200
committer	Piotr Sarnacki <drogus@gmail.com>	2012-08-28 10:51:03 +0200
commit	ba83aa7f03d2742dd242a45229e0f67785871515 (patch)
tree	ff35607be2e123302ba0db65f5c644d577c05dae /actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
parent	4efad291c11a7c4fcf178fbb887b4845eaf61757 (diff)
download	rails-ba83aa7f03d2742dd242a45229e0f67785871515.tar.gz rails-ba83aa7f03d2742dd242a45229e0f67785871515.tar.bz2 rails-ba83aa7f03d2742dd242a45229e0f67785871515.zip