class Nokogumbo

Public Class Methods

parse(string, max_parse_errors)
/*
 * Parse an HTML string with Gumbo, copy the resulting tree into a libxml2
 * document and return it wrapped as a Nokogiri document.
 *
 * self             - receiver (unused).
 * string           - Ruby String holding the input; a TypeError is raised for
 *                    any other type.
 * max_parse_errors - Ruby Integer stored in GumboOptions.max_errors.
 *
 * Returns the wrapped document. When Gumbo recorded parse errors, the
 * document's @errors instance variable is set to an Array holding one
 * XMLSyntaxError per Gumbo error, with @line, @column and @int1 (the Gumbo
 * error type) filled in from the Gumbo error.
 *
 * NOTE(review): the Ruby API calls made between the Gumbo parse and
 * gumbo_destroy_output() can raise (e.g. on allocation failure); in that case
 * `output` is not destroyed. Consider wrapping the body in rb_ensure().
 */
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
  // RSTRING_PTR/RSTRING_LEN do not type-check their argument, so reject
  // non-String input up front, before any native resource is allocated.
  Check_Type(string, T_STRING);

  GumboOptions options = kGumboDefaultOptions;
  options.max_errors = NUM2INT(max_parse_errors);

  const char *input = RSTRING_PTR(string);
  size_t input_len = RSTRING_LEN(string);
  GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
  xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
#ifdef NGLIB
  doc->type = XML_HTML_DOCUMENT_NODE;
#endif
  if (output->document->v.document.has_doctype) {
    const char *name   = output->document->v.document.name;
    const char *public = output->document->v.document.public_identifier;
    const char *system = output->document->v.document.system_identifier;
    // Empty public/system identifiers are passed on as NIL rather than "".
    xmlCreateIntSubset(doc, CONST_CAST name,
      (public[0] ? CONST_CAST public : NIL),
      (system[0] ? CONST_CAST system : NIL));
  }

  // Convert every top-level child of the Gumbo document; the child that is
  // Gumbo's root element becomes the libxml2 root element, any other child
  // is attached directly to the document node.
  GumboVector *children = &output->document->v.document.children;
  for (unsigned int i = 0; i < children->length; i++) {
    GumboNode *child = children->data[i];
    xmlNodePtr node = walk_tree(doc, child);
    if (node) {
      if (child == output->root) {
        xmlDocSetRootElement(doc, node);
      } else {
        xmlAddChild((xmlNodePtr)doc, node);
      }
    }
  }

  VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);

  // Add parse errors to rdoc.
  if (output->errors.length) {
    GumboVector *errors = &output->errors;
    GumboParser parser = { ._options = &options };
    GumboStringBuffer msg;
    VALUE rerrors = rb_ary_new2(errors->length);

    gumbo_string_buffer_init(&parser, &msg);
    for (unsigned int i = 0; i < errors->length; i++) {
      GumboError *err = errors->data[i];
      // The same buffer is reused for every message.
      gumbo_string_buffer_clear(&parser, &msg);
      // Work around bug in gumbo_caret_diagnostic_to_string.
      // See https://github.com/google/gumbo-parser/pull/371
      // The bug occurs when the error starts with a newline (unless it's the
      // first character in the input--but that shouldn't cause an error in
      // the first place).
      if (*err->original_text == '\n' && err->original_text != input) {
        --err->original_text;
      }
      gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
      VALUE err_str = rb_str_new(msg.data, msg.length);
      VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
      rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
      rb_iv_set(syntax_error, "@code", INT2NUM(1));   // XML_ERR_INTERNAL_ERROR
      rb_iv_set(syntax_error, "@level", INT2NUM(2));  // XML_ERR_ERROR
      rb_iv_set(syntax_error, "@file", Qnil);
      rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
      rb_iv_set(syntax_error, "@str1", Qnil);
      rb_iv_set(syntax_error, "@str2", Qnil);
      rb_iv_set(syntax_error, "@str3", Qnil);
      rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
      rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
      rb_ary_push(rerrors, syntax_error);
    }
    rb_iv_set(rdoc, "@errors", rerrors);
    gumbo_string_buffer_destroy(&parser, &msg);
  }

  // The Gumbo tree is released here, after the libxml2 document and the
  // error list have been built from it.
  gumbo_destroy_output(&options, output);

  return rdoc;
}