require 'strscan'

module HTML

  # A simple HTML tokenizer. It breaks a stream of text into tokens, where each
  # token is a string. Each string represents either "text", or an HTML element
  # (a regular tag, a comment, or a doctype-style "<!...>" declaration).
  #
  # This currently assumes valid XHTML, which means no free < or > characters.
  # Malformed input is tolerated rather than rejected: a "<" that is not
  # immediately followed by a non-whitespace character is returned as part of
  # a text token, and an unterminated comment extends to the end of the input.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer

    # Patterns that find the next backslash or closing quote inside a quoted
    # region, keyed by the quote character that opened the region.
    QUOTED_REGION_END = {
      "'" => /[\\']/,
      '"' => /[\\"]/
    }.freeze

    # The (byte) position in the text at which the most recently returned
    # token starts.
    attr_reader :position

    # The line number on which the most recently returned token starts
    # (1-based; 0 until the first token has been read).
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more tokens in
    # the stream. Every returned token is non-empty, so repeatedly calling this
    # method always reaches the end of the input.
    def next
      return nil if @scanner.eos?
      @position = @scanner.pos
      @line = @current_line
      # Only "<" directly followed by a non-whitespace character opens a tag;
      # anything else (including a lone "<") is treated as text.
      token = @scanner.check(/<\S/) ? scan_tag : scan_text
      update_current_line(token)
    end

    private

      # Treat the text at the current position as a tag, and scan it. Supports
      # comments, doctype tags, and regular tags, and ignores less-than and
      # greater-than characters within quoted strings.
      def scan_tag
        tag = @scanner.getch
        if @scanner.scan(/!--/) # comment
          tag << @scanner.matched
          # scan_until returns nil when the comment is never closed; in that
          # case the comment swallows whatever input is left.
          tag << (@scanner.scan_until(/--\s*>/) || rest_of_input)
        elsif @scanner.scan(/!/) # doctype
          tag << @scanner.matched
          tag << consume_quoted_regions
        else
          tag << consume_quoted_regions
        end
        tag
      end

      # Scan all text up to the next < character and return it. The first
      # character is consumed unconditionally: the caller only gets here when
      # the scanner is not at a tag start, so a leading "<" is literal text,
      # and consuming it guarantees the scanner always advances.
      def scan_text
        text = @scanner.getch
        text << @scanner.scan(/[^<]*/)
        text
      end

      # Consumes and returns everything from the current position to the end
      # of the input (possibly the empty string).
      def rest_of_input
        @scanner.scan(/.*/m)
      end

      # Counts the number of newlines in the text and updates the current line
      # accordingly. Returns the text unchanged.
      def update_current_line(text)
        @current_line += text.scan(/\r\n|\r|\n/).length
        text
      end

      # Scans up to and including the ">" that closes the current tag and
      # returns the consumed text. Skips over quoted strings, so that less-than
      # and greater-than characters within the strings are ignored. If the
      # input ends before a closing ">" is found, returns what was consumed.
      def consume_quoted_regions
        text = ""
        loop do
          match = @scanner.scan_until(/['">]/) or break
          text << match
          delim = @scanner.matched
          break if delim == ">"

          # consume the quoted region
          closing = QUOTED_REGION_END[delim]
          while (match = @scanner.scan_until(closing))
            text << match
            break if @scanner.matched == delim
            # A backslash escapes the next character; getch returns nil when
            # the backslash was the last character of the input.
            escaped = @scanner.getch or break
            text << escaped
          end
        end
        text
      end
  end

end