aboutsummaryrefslogtreecommitdiffstats
path: root/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
diff options
context:
space:
mode:
Diffstat (limited to 'actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb')
-rw-r--r--actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb95
1 files changed, 95 insertions, 0 deletions
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
new file mode 100644
index 0000000000..11bd48708e
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
@@ -0,0 +1,95 @@
require 'strscan'

module HTML

  # A simple HTML tokenizer. It breaks a stream of text into tokens, where
  # each token is a string. Each string represents either a run of text or an
  # HTML element (a regular tag, a comment, or a doctype-style <!...> tag).
  #
  # The tokenizer is written for well-formed (X)HTML, but it is tolerant of a
  # few common kinds of damage so that it neither raises nor stalls:
  #
  # * a "<" that does not start a tag (followed by whitespace or the end of
  #   the input) is returned as part of a text token;
  # * a comment that is never closed runs to the end of the input;
  # * a quoted attribute value that ends the input with a dangling backslash
  #   is returned as far as it goes.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer

    # Maps an opening quote character to the pattern that finds either the
    # matching closing quote or a backslash (which escapes the next character).
    QUOTED_REGION_END = {
      "'" => /[\\']/,
      '"' => /[\\"]/
    }.freeze

    # The (byte) position in the text at which the most recently returned
    # token starts. It is 0 until the first call to #next.
    attr_reader :position

    # The line number on which the most recently returned token starts
    # (1-based). It is 0 until the first call to #next.
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more
    # tokens in the stream. As a side effect, #position and #line are updated
    # to describe the start of the returned token.
    def next
      return nil if @scanner.eos?

      @position = @scanner.pos
      @line = @current_line

      # Only a "<" immediately followed by a non-space character opens a tag;
      # anything else (including a lone "<") is treated as text.
      token = @scanner.check(/<\S/) ? scan_tag : scan_text
      update_current_line(token)
    end

    private

    # Treat the text at the current position as a tag, and scan it. Supports
    # comments, doctype tags, and regular tags, and ignores less-than and
    # greater-than characters within quoted strings.
    def scan_tag
      tag = @scanner.getch
      if @scanner.scan(/!--/) # comment
        tag << @scanner.matched
        # An unterminated comment swallows the remainder of the input rather
        # than appending nil (which would raise a TypeError).
        tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan(/.*/m))
      elsif @scanner.scan(/!/) # doctype
        tag << @scanner.matched
        tag << consume_quoted_regions
      else
        tag << consume_quoted_regions
      end
      tag
    end

    # Scan all text up to the next < character and return it. The character
    # at the current position is always consumed, even when it is itself a
    # "<" that does not open a tag; otherwise the scanner would never advance
    # and #next would return empty tokens forever.
    def scan_text
      "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
    end

    # Counts the number of newlines in the text and updates the current line
    # accordingly. Returns the text unchanged so it can be used as the token.
    def update_current_line(text)
      @current_line += text.scan(/\r\n|\r|\n/).length
      text
    end

    # Consumes input up to and including the ">" that closes the current tag
    # and returns it. Quoted strings are skipped over, so that less-than and
    # greater-than characters within the strings are ignored. If no closing
    # ">" exists, whatever was consumed so far is returned.
    def consume_quoted_regions
      text = String.new
      loop do
        chunk = @scanner.scan_until(/['">]/)
        break unless chunk

        text << chunk
        delim = @scanner.matched
        break if delim == ">"

        # consume the quoted region
        terminator = QUOTED_REGION_END[delim]
        while (chunk = @scanner.scan_until(terminator))
          text << chunk
          # Stop at the closing quote, or when a backslash is the very last
          # character of the input and there is nothing left to escape.
          break if @scanner.matched == delim || @scanner.eos?
          text << @scanner.getch # skip the escaped character
        end
      end
      text
    end
  end

end