1 files changed, 95 insertions, 0 deletions
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
new file mode 100644
index 0000000000..11bd48708e
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
@@ -0,0 +1,95 @@
+require 'strscan'
+
+module HTML
+  # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+  # token is a string. Each string represents either "text", or an HTML element.
+  #
+  # This assumes valid XHTML, but stray or unterminated markup is still consumed, so
+  # every call to #next makes progress and the usage loop below always terminates.
+  #
+  # Usage:
+  #
+  #   tokenizer = HTML::Tokenizer.new(text)
+  #   while token = tokenizer.next
+  #     p token
+  #   end
+  class Tokenizer
+    # The (byte) position in the text at which the most recent token starts
+    attr_reader :position
+
+    # The line number on which the most recent token starts (0 until one is read)
+    attr_reader :line
+
+    # Create a new Tokenizer for the given text.
+    def initialize(text)
+      @scanner = StringScanner.new(text)
+      @position = 0
+      @line = 0
+      @current_line = 1
+    end
+
+    # Return the next token in the sequence, or +nil+ if there are no more tokens in
+    # the stream. Records where the token starts in #position and #line.
+    def next
+      return nil if @scanner.eos?
+      @position = @scanner.pos
+      @line = @current_line
+      if @scanner.check(/<\S/) # a < opens a tag only when a non-space follows it
+        update_current_line(scan_tag)
+      else
+        update_current_line(scan_text)
+      end
+    end
+
+    private
+
+      # Treat the text at the current position as a tag, and scan it. Supports
+      # comments (an unterminated one runs to the end of the text), doctype tags, and
+      # regular tags, and ignores < and > characters within quoted strings.
+      def scan_tag
+        tag = @scanner.getch
+        if @scanner.scan(/!--/) # comment
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan(/.*/m))
+        elsif @scanner.scan(/!/) # doctype
+          tag << @scanner.matched
+          tag << consume_quoted_regions
+        else
+          tag << consume_quoted_regions
+        end
+        tag
+      end
+
+      # Scan all text up to the next < character and return it. The first character
+      # is always consumed, even a < that opens no tag, so the token is never empty.
+      def scan_text
+        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+      end
+
+      # Counts the number of newlines in the text and updates the current line
+      # accordingly. Returns the text unchanged.
+      def update_current_line(text)
+        @current_line += text.scan(/\r\n|\r|\n/).length
+        text
+      end
+
+      # Skips over quoted strings, so that less-than and greater-than characters
+      # within the strings are ignored. Returns the text consumed, ending with the
+      # closing > when one is found.
+      def consume_quoted_regions
+        text = ""
+        loop do
+          match = @scanner.scan_until(/['">]/) or break
+          text << match
+          break if (delim = @scanner.matched) == ">"
+          # consume the quoted region; a backslash escapes the character after it
+          while match = @scanner.scan_until(/[\\#{delim}]/)
+            text << match
+            break if @scanner.matched == delim || @scanner.eos?
+            text << @scanner.getch # skip the escaped character
+          end
+        end
+        text
+      end
+  end
+end
+\ No newline at end of file