aboutsummaryrefslogtreecommitdiffstats
path: root/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
diff options
context:
space:
mode:
authorPiotr Sarnacki <drogus@gmail.com>2012-06-09 14:40:50 +0200
committerPiotr Sarnacki <drogus@gmail.com>2012-08-28 10:51:03 +0200
commitba83aa7f03d2742dd242a45229e0f67785871515 (patch)
treeff35607be2e123302ba0db65f5c644d577c05dae /actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
parent4efad291c11a7c4fcf178fbb887b4845eaf61757 (diff)
downloadrails-ba83aa7f03d2742dd242a45229e0f67785871515.tar.gz
rails-ba83aa7f03d2742dd242a45229e0f67785871515.tar.bz2
rails-ba83aa7f03d2742dd242a45229e0f67785871515.zip
Move action_controller/vendor/html-scanner to action_view
This is another step in moving Action View's dependencies in Action Pack to Action View itself. Also, HtmlScanner seems to be better suited for views rather than controllers.
Diffstat (limited to 'actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb')
-rw-r--r--actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb107
1 files changed, 107 insertions, 0 deletions
diff --git a/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb b/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
new file mode 100644
index 0000000000..8ac8d34430
--- /dev/null
+++ b/actionpack/lib/action_view/vendor/html-scanner/html/tokenizer.rb
@@ -0,0 +1,107 @@
+require 'strscan'
+
+module HTML #:nodoc:
+
+ # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+ # token is a string. Each string represents either "text", or an HTML element.
+ #
+ # This currently assumes valid XHTML, which means no free < or > characters.
+ #
+ # Usage:
+ #
+ # tokenizer = HTML::Tokenizer.new(text)
+ # while token = tokenizer.next
+ # p token
+ # end
+ class Tokenizer #:nodoc:
+
+ # The current (byte) position in the text
+ attr_reader :position
+
+ # The current line number
+ attr_reader :line
+
+ # Create a new Tokenizer for the given text.
+ def initialize(text)
+ text.encode!
+ @scanner = StringScanner.new(text)
+ @position = 0
+ @line = 0
+ @current_line = 1
+ end
+
+ # Return the next token in the sequence, or +nil+ if there are no more tokens in
+ # the stream.
+ def next
+ return nil if @scanner.eos?
+ @position = @scanner.pos
+ @line = @current_line
+ if @scanner.check(/<\S/)
+ update_current_line(scan_tag)
+ else
+ update_current_line(scan_text)
+ end
+ end
+
+ private
+
+ # Treat the text at the current position as a tag, and scan it. Supports
+ # comments, doctype tags, and regular tags, and ignores less-than and
+ # greater-than characters within quoted strings.
+ def scan_tag
+ tag = @scanner.getch
+ if @scanner.scan(/!--/) # comment
+ tag << @scanner.matched
+ tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
+ elsif @scanner.scan(/!\[CDATA\[/)
+ tag << @scanner.matched
+ tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
+ elsif @scanner.scan(/!/) # doctype
+ tag << @scanner.matched
+ tag << consume_quoted_regions
+ else
+ tag << consume_quoted_regions
+ end
+ tag
+ end
+
+ # Scan all text up to the next < character and return it.
+ def scan_text
+ "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+ end
+
+ # Counts the number of newlines in the text and updates the current line
+ # accordingly.
+ def update_current_line(text)
+ text.scan(/\r?\n/) { @current_line += 1 }
+ end
+
+ # Skips over quoted strings, so that less-than and greater-than characters
+ # within the strings are ignored.
+ def consume_quoted_regions
+ text = ""
+ loop do
+ match = @scanner.scan_until(/['"<>]/) or break
+
+ delim = @scanner.matched
+ if delim == "<"
+ match = match.chop
+ @scanner.pos -= 1
+ end
+
+ text << match
+ break if delim == "<" || delim == ">"
+
+ # consume the quoted region
+ while match = @scanner.scan_until(/[\\#{delim}]/)
+ text << match
+ break if @scanner.matched == delim
+ break if @scanner.eos?
+ text << @scanner.getch # skip the escaped character
+ end
+ end
+ text
+ end
+ end
+
+end