diff options
Diffstat (limited to 'actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb')
-rw-r--r-- | actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb | 95 |
1 files changed, 95 insertions, 0 deletions
require 'strscan'

module HTML

  # A simple HTML tokenizer. It breaks a stream of text into tokens, where
  # each token is a string. Each string represents either a run of text or an
  # HTML element (a regular tag, a doctype-style "<!" tag, or a comment).
  #
  # The tokenizer is written for well-formed markup but degrades gracefully on
  # malformed input instead of raising or looping forever:
  #
  # * a free "<" (one followed by whitespace or the end of the input) is
  #   returned as part of a text token;
  # * a comment that is never closed extends to the end of the input;
  # * a quoted attribute value that ends with a dangling backslash simply
  #   stops at the end of the input.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer

    # Matches a single line break in any of the three common conventions.
    NEWLINE = /\r\n|\r|\n/.freeze

    # For each quote character, the pattern that finds either a backslash
    # (start of an escape) or the matching closing quote.
    QUOTE_TERMINATORS = {
      "'" => /[\\']/,
      '"' => /[\\"]/
    }.freeze

    # The (byte) position in the text at which the most recently returned
    # token starts. 0 before the first call to #next.
    attr_reader :position

    # The line number on which the most recently returned token starts.
    # 0 before the first call to #next; the first line of the text is line 1.
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more
    # tokens in the stream. Also updates #position and #line so that they
    # describe the start of the returned token.
    def next
      return nil if @scanner.eos?

      @position = @scanner.pos
      @line = @current_line

      # Only a "<" immediately followed by a non-space character opens a tag;
      # anything else (including a lone "<") is treated as text.
      token = @scanner.check(/<\S/) ? scan_tag : scan_text
      update_current_line(token)
    end

    private

      # Treat the text at the current position as a tag, and scan it. Supports
      # comments, doctype tags, and regular tags, and ignores less-than and
      # greater-than characters within quoted strings.
      def scan_tag
        tag = @scanner.getch # the opening "<"
        if @scanner.scan(/!--/) # comment
          tag << @scanner.matched
          # An unterminated comment swallows the remainder of the input
          # rather than appending nil (which would raise a TypeError).
          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan(/.*/m))
        elsif @scanner.scan(/!/) # doctype
          tag << @scanner.matched
          tag << consume_quoted_regions
        else
          tag << consume_quoted_regions
        end
        tag
      end

      # Scan all text up to the next < character and return it.
      #
      # This is only called when the scanner is not at the start of a tag, so
      # a "<" at the current position is a free one. It is consumed as part of
      # the text; otherwise the scan would match the empty string, the scanner
      # would never advance, and callers looping on #next would never finish.
      def scan_text
        @scanner.scan(/<?[^<]*/)
      end

      # Counts the number of newlines in the text and updates the current line
      # accordingly. Returns the text unchanged so it can be used as the token.
      def update_current_line(text)
        @current_line += text.scan(NEWLINE).length
        text
      end

      # Scans up to and including the ">" that closes the current tag and
      # returns the scanned text. Quoted strings are skipped over as a unit,
      # so that less-than and greater-than characters within them are ignored.
      # If the input ends before a closing ">" is found, whatever could be
      # consumed so far is returned.
      def consume_quoted_regions
        text = String.new
        loop do
          chunk = @scanner.scan_until(/['">]/)
          break unless chunk

          text << chunk
          delim = @scanner.matched
          break if delim == ">"

          # consume the quoted region
          terminator = QUOTE_TERMINATORS.fetch(delim)
          while (chunk = @scanner.scan_until(terminator))
            text << chunk
            break if @scanner.matched == delim
            # A backslash as the very last character has nothing to escape.
            break if @scanner.eos?
            text << @scanner.getch # skip the escaped character
          end
        end
        text
      end
  end

end
\ No newline at end of file |