class Sanitize

Constants

REGEX_PROTOCOL

Matches an attribute value that could be treated by a browser as a URL with a protocol prefix, such as “http:” or “javascript:”. Any string of zero or more characters followed by a colon is considered a match, even if the colon is encoded as an entity and even if it's an incomplete entity (which IE6 and Opera will still parse).

REGEX_UNSUITABLE_CHARS

Matches Unicode characters that should be stripped from HTML before passing it to the parser.

See https://www.w3.org/TR/unicode-xml/#Charlist

VERSION

Attributes

config[R]

Public Class Methods

clean(html, config = {})

@deprecated Use {.fragment} instead.

Alias for: fragment
clean_document(html, config = {})

@deprecated Use {.document} instead.

Alias for: document
clean_node!(node, config = {})

@deprecated Use {.node!} instead.

Alias for: node!
document(html, config = {}) click to toggle source

Returns a sanitized copy of the given full html document, using the settings in config if specified.

When sanitizing a document, the `<html>` element must be whitelisted or an error will be raised. If this is undesirable, you should probably use {.fragment} instead.

# File lib/sanitize.rb, line 44
# Class-level convenience: builds a throwaway Sanitize instance from +config+
# and delegates to its #document method.
#
# @param html the full HTML document to sanitize
# @param config [Hash] settings handed to Sanitize.new
# @return the value produced by the instance-level #document call
def self.document(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.document(html)
end
Also aliased as: clean_document
fragment(html, config = {}) click to toggle source

Returns a sanitized copy of the given html fragment, using the settings in config if specified.

# File lib/sanitize.rb, line 50
# Class-level convenience: builds a throwaway Sanitize instance from +config+
# and delegates to its #fragment method.
#
# @param html the HTML fragment to sanitize
# @param config [Hash] settings handed to Sanitize.new
# @return the value produced by the instance-level #fragment call
def self.fragment(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.fragment(html)
end
Also aliased as: clean
new(config = {}) click to toggle source

Returns a new Sanitize object initialized with the settings in config.

# File lib/sanitize.rb, line 76
# Sets up a sanitizer: merges +config+ over Config::DEFAULT and assembles the
# ordered transformer chain that #transform_node! later runs for each node.
#
# @param config [Hash] caller overrides merged on top of Config::DEFAULT
def initialize(config = {})
  @config             = Config.merge(Config::DEFAULT, config)
  @transformer_config = { config: @config }

  # Work on a copy of the configured list so the built-in transformers appended
  # below never end up inside the config's own :transformers value.
  chain = Array(@config[:transformers]).dup

  # Built-in transformers are appended after any custom ones, so they always
  # run at the end of the chain.
  chain << Transformers::CleanElement.new(@config)
  chain << Transformers::CleanComment unless @config[:allow_comments]

  # The CSS sanitizer is only created when something needs it, and a single
  # instance is shared by the element and attribute transformers.
  css_sanitizer = nil

  if @config[:elements].include?('style')
    css_sanitizer = Sanitize::CSS.new(config)
    chain << Transformers::CSS::CleanElement.new(css_sanitizer)
  end

  style_attr_allowed = @config[:attributes].values.any? do |allowed_names|
    allowed_names.include?('style')
  end

  if style_attr_allowed
    css_sanitizer ||= Sanitize::CSS.new(config)
    chain << Transformers::CSS::CleanAttribute.new(css_sanitizer)
  end

  chain << Transformers::CleanDoctype
  chain << Transformers::CleanCDATA

  @transformers = chain
end
node!(node, config = {}) click to toggle source

Sanitizes the given `Nokogiri::XML::Node` instance and all its children.

# File lib/sanitize.rb, line 55
# Class-level convenience: builds a throwaway Sanitize instance from +config+
# and delegates to its #node! method.
#
# @param node the node to sanitize
# @param config [Hash] settings handed to Sanitize.new
# @return the value produced by the instance-level #node! call
def self.node!(node, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.node!(node)
end
Also aliased as: clean_node!

Public Instance Methods

clean(html)

@deprecated Use {#fragment} instead.

Alias for: fragment
clean_document(html)

@deprecated Use {#document} instead.

Alias for: document
clean_node!(node)

@deprecated Use {#node!} instead.

Alias for: node!
document(html) click to toggle source

Returns a sanitized copy of the given html document.

When sanitizing a document, the `<html>` element must be whitelisted or an error will be raised. If this is undesirable, you should probably use {#fragment} instead.

# File lib/sanitize.rb, line 107
# Sanitizes +html+ as a complete document and returns the serialized result.
#
# @param html the document source; +nil+ or +false+ short-circuits to ''
# @return [String] '' for falsy input, otherwise the output of #to_html
def document(html)
  return '' unless html

  # Parse the preprocessed source, clean the resulting tree in place, then
  # serialize that same tree.
  tree = Nokogiri::HTML5.parse(preprocess(html)).tap { |parsed| node!(parsed) }
  to_html(tree)
end
Also aliased as: clean_document
fragment(html) click to toggle source

Returns a sanitized copy of the given html fragment.

# File lib/sanitize.rb, line 119
# Sanitizes +html+ as a fragment and returns the serialized result.
#
# @param html the fragment source; +nil+ or +false+ short-circuits to ''
# @return [String] '' for falsy input, otherwise the output of #to_html
def fragment(html)
  return '' unless html

  html   = preprocess(html)
  parsed = Nokogiri::HTML5.parse("<html><body>#{html}")

  # Hack to allow fragments containing <body> (borrowed from
  # Nokogiri::HTML::DocumentFragment): when the input itself opens with a
  # <body> tag, keep the body element; otherwise take only the body's children.
  selector = (html =~ /\A<body(?:\s|>)/i) ? '/html/body' : '/html/body/node()'

  holder = parsed.fragment
  holder << parsed.xpath(selector)

  node!(holder)
  to_html(holder)
end
Also aliased as: clean
node!(node) click to toggle source

Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it in place.

If node is a `Nokogiri::XML::Document`, the `<html>` element must be whitelisted or an error will be raised.

# File lib/sanitize.rb, line 148
# Runs the transformer chain over +node+ and every descendant, modifying the
# tree in place.
#
# @param node [Nokogiri::XML::Node] root of the subtree to sanitize
# @return [Nokogiri::XML::Node] the same +node+ object that was passed in
# @raise [ArgumentError] if +node+ is not a Nokogiri::XML::Node
# @raise [Error] if +node+ is a document and 'html' is not in the configured
#   element list
def node!(node)
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

  if node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?('html')
    raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
  end

  # One whitelist set is shared across the whole traversal so that entries
  # added while handling one node are visible when later nodes are handled.
  whitelist = Set.new
  traverse(node) { |current| transform_node!(current, whitelist) }

  node
end
Also aliased as: clean_node!

Private Instance Methods

preprocess(html) click to toggle source

Preprocesses HTML before parsing to remove undesirable Unicode chars.

# File lib/sanitize.rb, line 172
# Prepares raw input for the parser: coerces it to a fresh, unfrozen String,
# makes sure it is valid UTF-8, and strips every character matched by
# REGEX_UNSUITABLE_CHARS.
#
# @param html any object; converted with #to_s (so +nil+ becomes '')
# @return [String] a new UTF-8 string; the caller's object is never mutated
def preprocess(html)
  html = html.to_s.dup

  if html.encoding.name == 'UTF-8'
    # Input already tagged as UTF-8 skips transcoding, so malformed byte
    # sequences would otherwise reach gsub! below and raise ArgumentError.
    # Replace them with U+FFFD, as the transcoding branch does for other
    # encodings.
    html.scrub! unless html.valid_encoding?
  else
    html.encode!('UTF-8',
      :invalid => :replace,
      :undef   => :replace)
  end

  html.gsub!(REGEX_UNSUITABLE_CHARS, '')
  html
end
to_html(node) click to toggle source
# File lib/sanitize.rb, line 185
# Serializes +node+ to an HTML string with no added formatting.
#
# For document nodes, the Content-Type <meta> tag that the serializer injects
# right after <head> is removed again, unless <meta> is whitelisted and the
# document itself carried a content-type meta tag.
#
# @param node the node (document or fragment) to serialize
# @return [String] the serialized HTML
def to_html(node)
  node_kind   = node.type
  is_document = node_kind == Nokogiri::XML::Node::DOCUMENT_NODE ||
                node_kind == Nokogiri::XML::Node::HTML_DOCUMENT_NODE

  meta_pattern = nil
  strip_meta   = false

  # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
  # meta tag to all serialized HTML documents.
  #
  # https://github.com/sparklemotion/nokogiri/issues/1008
  if is_document
    meta_pattern = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i

    # Strip the tag when <meta> isn't whitelisted at all; otherwise strip it
    # only if the document has no content-type meta tag of its own.
    strip_meta =
      if @config[:elements].include?('meta')
        declared = node.xpath('/html/head/meta[@http-equiv]')
        declared.none? { |meta| meta['http-equiv'].casecmp('content-type').zero? }
      else
        true
      end
  end

  flags = Nokogiri::XML::Node::SaveOptions

  # No indentation/formatting, so the serializer doesn't insert newlines
  # after certain tags.
  output = node.to_html(
    encoding:  'utf-8',
    indent:    0,
    save_with: flags::NO_DECLARATION | flags::NO_EMPTY_TAGS | flags::AS_HTML
  )

  output.gsub!(meta_pattern, '\1') if strip_meta
  output
end
transform_node!(node, node_whitelist) click to toggle source
# File lib/sanitize.rb, line 219
# Runs every transformer in the chain against +node+, in order.
#
# A transformer may return a Hash whose :node_whitelist value responds to
# #each; those entries are merged into +node_whitelist+ and are therefore
# visible to the transformers that run after it.
#
# @param node the node being processed
# @param node_whitelist [Set] whitelist shared across the traversal
# @return the +node+ that was passed in
def transform_node!(node, node_whitelist)
  @transformers.each do |transformer|
    # This may run in a tight loop over thousands of nodes, so a single env
    # hash is reused for every call and its keys are assigned directly rather
    # than building a fresh hash to merge each time:
    # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
    env = @transformer_config
    env[:is_whitelisted] = node_whitelist.include?(node)
    env[:node]           = node
    env[:node_name]      = node.name.downcase
    env[:node_whitelist] = node_whitelist

    outcome = transformer.call(env)
    next unless outcome.is_a?(Hash)

    additions = outcome[:node_whitelist]
    node_whitelist.merge(additions) if additions.respond_to?(:each)
  end

  node
end
traverse(node) { |node| ... } click to toggle source

Performs top-down traversal of the given node, operating first on the node itself, then traversing each child (if any) in order.

# File lib/sanitize.rb, line 246
# Walks the tree rooted at +node+ top-down: yields +node+ first, then recurses
# into each child in order. The walk tolerates the block unlinking or
# reparenting the child it was just given.
#
# @param node the root of the subtree to walk
# @yield [node] each visited node, parents before their children
def traverse(node, &block)
  yield node

  current = node.child

  while current
    # Remember the left neighbour before descending, because the block may
    # detach +current+ from +node+ during the recursive call.
    before = current.previous_sibling
    traverse(current, &block)

    current =
      if current.parent == node
        # Still attached where it was: move on to its next sibling.
        current.next_sibling
      elsif before
        # Unlinked or reparented: resume after the remembered left neighbour.
        before.next_sibling
      else
        # It was the first child, so resume from whatever is first now.
        node.child
      end
  end
end