class Nokogumbo
Public Class Methods
parse(string, max_parse_errors)
click to toggle source
/*
 * Parse an HTML5 document with Gumbo and convert the resulting tree into a
 * libxml2 document wrapped as a Ruby document object.
 *
 * self             - receiver (not used by the body).
 * string           - Ruby String holding the HTML source.
 * max_parse_errors - Ruby Integer copied into GumboOptions.max_errors.
 *
 * Returns the wrapped document. When Gumbo reported parse errors, the
 * document's @errors instance variable is set to an Array holding one
 * XMLSyntaxError per Gumbo error.
 *
 * Raises TypeError if string is not a String, and NoMemoryError if libxml2
 * cannot allocate the document.
 */
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
  // RSTRING_PTR/RSTRING_LEN on a non-String would read unrelated memory, so
  // reject bad arguments before anything is allocated.
  Check_Type(string, T_STRING);

  GumboOptions options;
  memcpy(&options, &kGumboDefaultOptions, sizeof options);
  options.max_errors = NUM2INT(max_parse_errors);

  const char *input = RSTRING_PTR(string);
  size_t input_len = RSTRING_LEN(string);
  GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);

  xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
  if (!doc) {
    // Release the Gumbo tree first: rb_raise does not return.
    gumbo_destroy_output(&options, output);
    rb_raise(rb_eNoMemError, "failed to allocate XML document");
  }
#ifdef NGLIB
  doc->type = XML_HTML_DOCUMENT_NODE;
#endif

  if (output->document->v.document.has_doctype) {
    const char *name = output->document->v.document.name;
    const char *public_id = output->document->v.document.public_identifier;
    const char *system_id = output->document->v.document.system_identifier;
    // Empty identifiers are passed to libxml2 as NIL rather than as "".
    xmlCreateIntSubset(doc, CONST_CAST name,
                       (public_id[0] ? CONST_CAST public_id : NIL),
                       (system_id[0] ? CONST_CAST system_id : NIL));
  }

  // Convert every top-level Gumbo node; the one Gumbo designates as the root
  // becomes the document's root element, the rest are plain children.
  GumboVector *children = &output->document->v.document.children;
  for (unsigned int i = 0; i < children->length; i++) {
    GumboNode *child = children->data[i];
    xmlNodePtr node = walk_tree(doc, child);
    if (node) {
      if (child == output->root)
        xmlDocSetRootElement(doc, node);
      else
        xmlAddChild((xmlNodePtr)doc, node);
    }
  }

  VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);

  // Add parse errors to rdoc.
  // NOTE(review): if one of the Ruby calls below raises, output and msg are
  // not released; consider rb_ensure if that matters in practice.
  if (output->errors.length) {
    GumboVector *errors = &output->errors;
    GumboParser parser = { ._options = &options };
    GumboStringBuffer msg;
    VALUE rerrors = rb_ary_new2(errors->length);

    gumbo_string_buffer_init(&parser, &msg);
    for (unsigned int i = 0; i < errors->length; i++) {
      GumboError *err = errors->data[i];
      gumbo_string_buffer_clear(&parser, &msg);

      // Work around bug in gumbo_caret_diagnostic_to_string.
      // See https://github.com/google/gumbo-parser/pull/371
      // The bug occurs when the error starts with a newline (unless it's the
      // first character in the input--but that shouldn't cause an error in
      // the first place).
      if (*err->original_text == '\n' && err->original_text != input)
        --err->original_text;
      gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);

      VALUE err_str = rb_str_new(msg.data, msg.length);
      VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
      rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
      rb_iv_set(syntax_error, "@code", INT2NUM(1));   // XML_ERR_INTERNAL_ERROR
      rb_iv_set(syntax_error, "@level", INT2NUM(2));  // XML_ERR_ERROR
      rb_iv_set(syntax_error, "@file", Qnil);
      rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
      rb_iv_set(syntax_error, "@str1", Qnil);
      rb_iv_set(syntax_error, "@str2", Qnil);
      rb_iv_set(syntax_error, "@str3", Qnil);
      rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
      rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
      rb_ary_push(rerrors, syntax_error);
    }
    rb_iv_set(rdoc, "@errors", rerrors);
    gumbo_string_buffer_destroy(&parser, &msg);
  }

  gumbo_destroy_output(&options, output);
  return rdoc;
}