diff options
author | Piotr Sarnacki <drogus@gmail.com> | 2013-06-20 15:42:49 -0700 |
---|---|---|
committer | Piotr Sarnacki <drogus@gmail.com> | 2013-06-20 15:42:49 -0700 |
commit | a29f746398e7b0647885343e7f26d977dd251999 (patch) | |
tree | 1e2cd2ee1f8f31812c0acf71350ffe423ca8c5a9 /actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb | |
parent | 7c69a829a311a31109939cff19b700b36b97d5c4 (diff) | |
parent | d6b1caa8f2011487c08b414605883f1f220d0aaa (diff) | |
download | rails-a29f746398e7b0647885343e7f26d977dd251999.tar.gz rails-a29f746398e7b0647885343e7f26d977dd251999.tar.bz2 rails-a29f746398e7b0647885343e7f26d977dd251999.zip |
Merge pull request #11032 from strzalek/extract-actionview
Extract ActionView to separate directory
Diffstat (limited to 'actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb')
-rw-r--r-- | actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb b/actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb new file mode 100644 index 0000000000..8ac8d34430 --- /dev/null +++ b/actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb @@ -0,0 +1,107 @@ +require 'strscan' + +module HTML #:nodoc: + + # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each + # token is a string. Each string represents either "text", or an HTML element. + # + # This currently assumes valid XHTML, which means no free < or > characters. + # + # Usage: + # + # tokenizer = HTML::Tokenizer.new(text) + # while token = tokenizer.next + # p token + # end + class Tokenizer #:nodoc: + + # The current (byte) position in the text + attr_reader :position + + # The current line number + attr_reader :line + + # Create a new Tokenizer for the given text. + def initialize(text) + text.encode! + @scanner = StringScanner.new(text) + @position = 0 + @line = 0 + @current_line = 1 + end + + # Return the next token in the sequence, or +nil+ if there are no more tokens in + # the stream. + def next + return nil if @scanner.eos? + @position = @scanner.pos + @line = @current_line + if @scanner.check(/<\S/) + update_current_line(scan_tag) + else + update_current_line(scan_text) + end + end + + private + + # Treat the text at the current position as a tag, and scan it. Supports + # comments, doctype tags, and regular tags, and ignores less-than and + # greater-than characters within quoted strings. + def scan_tag + tag = @scanner.getch + if @scanner.scan(/!--/) # comment + tag << @scanner.matched + tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/)) + elsif @scanner.scan(/!\[CDATA\[/) + tag << @scanner.matched + tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/)) + elsif @scanner.scan(/!/) # doctype + tag << @scanner.matched + tag << consume_quoted_regions + else + tag << consume_quoted_regions + end + tag + end + + # Scan all text up to the next < character and return it. + def scan_text + "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}" + end + + # Counts the number of newlines in the text and updates the current line + # accordingly. + def update_current_line(text) + text.scan(/\r?\n/) { @current_line += 1 } + end + + # Skips over quoted strings, so that less-than and greater-than characters + # within the strings are ignored. + def consume_quoted_regions + text = "" + loop do + match = @scanner.scan_until(/['"<>]/) or break + + delim = @scanner.matched + if delim == "<" + match = match.chop + @scanner.pos -= 1 + end + + text << match + break if delim == "<" || delim == ">" + + # consume the quoted region + while match = @scanner.scan_until(/[\\#{delim}]/) + text << match + break if @scanner.matched == delim + break if @scanner.eos? + text << @scanner.getch # skip the escaped character + end + end + text + end + end + +end |