# Provenance: actionview/lib/action_view/vendor/html-scanner/html/tokenizer.rb,
# as added by commit 0d6e8edc2a47a4b4c6824936632bfb83850db343
# ("Move actionpack/lib/action_view* into actionview/lib",
# Piotr Sarnacki, 2013-05-04).

require 'strscan'

module HTML #:nodoc:

  # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
  # token is a string. Each string represents either "text", or an HTML element.
  #
  # This currently assumes valid XHTML, which means no free < or > characters.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer #:nodoc:

    # The (byte) position in the text at which the most recently returned
    # token starts (0 before the first call to #next).
    attr_reader :position

    # The line number on which the most recently returned token starts
    # (0 before the first call to #next; the first line is line 1).
    attr_reader :line

    # Create a new Tokenizer for the given text.
    #
    # The text is transcoded with the non-destructive String#encode, so the
    # caller's string is never modified and frozen strings are accepted
    # (String#encode! raises on a frozen receiver).
    def initialize(text)
      @scanner = StringScanner.new(text.encode)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more tokens in
    # the stream.
    #
    # A "<" immediately followed by a non-whitespace character starts a tag
    # token; anything else is scanned as text. #position and #line are
    # updated to describe the start of the returned token.
    def next
      return nil if @scanner.eos?

      @position = @scanner.pos
      @line = @current_line
      token = @scanner.check(/<\S/) ? scan_tag : scan_text
      update_current_line(token)
    end

    private

      # Treat the text at the current position as a tag, and scan it. Supports
      # comments, CDATA sections, doctype tags, and regular tags, and ignores
      # less-than and greater-than characters within quoted strings.
      #
      # An unterminated comment or CDATA section swallows the rest of the
      # input. The fallback is anchored with \z (not \Z) so that a trailing
      # newline stays part of the token instead of being split off.
      def scan_tag
        tag = @scanner.getch
        if @scanner.scan(/!--/) # comment
          tag << @scanner.matched
          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\z/))
        elsif @scanner.scan(/!\[CDATA\[/)
          tag << @scanner.matched
          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\z/))
        elsif @scanner.scan(/!/) # doctype
          tag << @scanner.matched
          tag << consume_quoted_regions
        else
          tag << consume_quoted_regions
        end
        tag
      end

      # Scan all text up to the next < character and return it. The first
      # character is taken unconditionally, so a lone "<" that does not start
      # a tag becomes part of the text token.
      def scan_text
        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
      end

      # Counts the number of newlines in the text and updates the current line
      # accordingly. Returns +text+ so that #next can hand the token straight
      # back to its caller.
      def update_current_line(text)
        # Every "\n" or "\r\n" line break contains exactly one "\n".
        @current_line += text.count("\n")
        text
      end

      # Skips over quoted strings, so that less-than and greater-than characters
      # within the strings are ignored. Returns everything consumed, up to and
      # including the closing ">", or up to (but excluding) a "<" found outside
      # of quotes. Returns what was gathered so far if neither is found.
      def consume_quoted_regions
        text = "".dup # mutable buffer, independent of frozen-literal settings
        loop do
          match = @scanner.scan_until(/['"<>]/)
          break unless match

          delim = @scanner.matched
          if delim == "<"
            # A new tag starts here: give the "<" back to the scanner.
            match = match.chop
            @scanner.pos -= 1
          end

          text << match
          break if delim == "<" || delim == ">"

          # consume the quoted region
          closer = /[\\#{delim}]/
          while (match = @scanner.scan_until(closer))
            text << match
            break if @scanner.matched == delim
            break if @scanner.eos?
            text << @scanner.getch # skip the escaped character
          end
        end
        text
      end
  end

end