360 lines
8.4 KiB
C
360 lines
8.4 KiB
C
|
/*
|
||
|
* This file is part of LibDOM.
|
||
|
* Licensed under the MIT License,
|
||
|
* http://www.opensource.org/licenses/mit-license.php
|
||
|
*
|
||
|
* Copyright 2010 - 2011 Michael Drake <tlsa@netsurf-browser.org>
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* Load an HTML file into LibDOM with Hubbub and print out the DOM structure.
|
||
|
*
|
||
|
* This example demonstrates the following:
|
||
|
*
|
||
|
* 1. Using LibDOM's Hubbub binding to read an HTML file into LibDOM.
|
||
|
* 2. Walking around the DOM tree.
|
||
|
* 3. Accessing DOM node attributes.
|
||
|
*
|
||
|
* Example input:
|
||
|
* <html><body><h1 class="woo">NetSurf</h1>
|
||
|
* <p>NetSurf is <em>awesome</em>!</p>
|
||
|
* <div><h2>Hubbub</h2><p>Hubbub is too.</p>
|
||
|
* <p>Big time.</p></div></body></html>
|
||
|
*
|
||
|
* Example output:
|
||
|
*
|
||
|
* HTML
|
||
|
* +-BODY
|
||
|
* | +-H1 class="woo"
|
||
|
* | +-P
|
||
|
* | | +-EM
|
||
|
* | +-DIV
|
||
|
* | | +-H2
|
||
|
* | | +-P
|
||
|
* | | +-P
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#include <assert.h>
|
||
|
#include <stdarg.h>
|
||
|
#include <stdbool.h>
|
||
|
#include <stdint.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#include <dom/dom.h>
|
||
|
#include <dom/bindings/hubbub/parser.h>
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Generate a LibDOM document DOM from an HTML file
|
||
|
*
|
||
|
* \param file The file path
|
||
|
* \return pointer to DOM document, or NULL on error
|
||
|
*/
|
||
|
dom_document *create_doc_dom_from_file(char *file)
|
||
|
{
|
||
|
size_t buffer_size = 1024;
|
||
|
dom_hubbub_parser *parser = NULL;
|
||
|
FILE *handle;
|
||
|
int chunk_length;
|
||
|
dom_hubbub_error error;
|
||
|
dom_hubbub_parser_params params;
|
||
|
dom_document *doc;
|
||
|
unsigned char buffer[buffer_size];
|
||
|
|
||
|
params.enc = NULL;
|
||
|
params.fix_enc = true;
|
||
|
params.enable_script = false;
|
||
|
params.msg = NULL;
|
||
|
params.script = NULL;
|
||
|
params.ctx = NULL;
|
||
|
params.daf = NULL;
|
||
|
|
||
|
/* Create Hubbub parser */
|
||
|
error = dom_hubbub_parser_create(¶ms, &parser, &doc);
|
||
|
if (error != DOM_HUBBUB_OK) {
|
||
|
printf("Can't create Hubbub Parser\n");
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
/* Open input file */
|
||
|
handle = fopen(file, "rb");
|
||
|
if (handle == NULL) {
|
||
|
dom_hubbub_parser_destroy(parser);
|
||
|
printf("Can't open test input file: %s\n", file);
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
/* Parse input file in chunks */
|
||
|
chunk_length = buffer_size;
|
||
|
while (chunk_length == buffer_size) {
|
||
|
chunk_length = fread(buffer, 1, buffer_size, handle);
|
||
|
error = dom_hubbub_parser_parse_chunk(parser, buffer,
|
||
|
chunk_length);
|
||
|
if (error != DOM_HUBBUB_OK) {
|
||
|
dom_hubbub_parser_destroy(parser);
|
||
|
printf("Parsing errors occur\n");
|
||
|
return NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Done parsing file */
|
||
|
error = dom_hubbub_parser_completed(parser);
|
||
|
if (error != DOM_HUBBUB_OK) {
|
||
|
dom_hubbub_parser_destroy(parser);
|
||
|
printf("Parsing error when construct DOM\n");
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
/* Finished with parser */
|
||
|
dom_hubbub_parser_destroy(parser);
|
||
|
|
||
|
/* Close input file */
|
||
|
if (fclose(handle) != 0) {
|
||
|
printf("Can't close test input file: %s\n", file);
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
return doc;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Dump attribute/value for an element node
|
||
|
*
|
||
|
* \param node The element node to dump attribute details for
|
||
|
* \param attribute The attribute to dump
|
||
|
* \return true on success, or false on error
|
||
|
*/
|
||
|
bool dump_dom_element_attribute(dom_node *node, char *attribute)
|
||
|
{
|
||
|
dom_exception exc;
|
||
|
dom_string *attr = NULL;
|
||
|
dom_string *attr_value = NULL;
|
||
|
dom_node_type type;
|
||
|
const char *string;
|
||
|
size_t length;
|
||
|
|
||
|
/* Should only have element nodes here */
|
||
|
exc = dom_node_get_node_type(node, &type);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf(" Exception raised for node_get_node_type\n");
|
||
|
return false;
|
||
|
}
|
||
|
assert(type == DOM_ELEMENT_NODE);
|
||
|
|
||
|
/* Create a dom_string containing required attribute name. */
|
||
|
exc = dom_string_create_interned((uint8_t *)attribute,
|
||
|
strlen(attribute), &attr);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf(" Exception raised for dom_string_create\n");
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/* Get class attribute's value */
|
||
|
exc = dom_element_get_attribute(node, attr, &attr_value);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf(" Exception raised for element_get_attribute\n");
|
||
|
dom_string_unref(attr);
|
||
|
return false;
|
||
|
} else if (attr_value == NULL) {
|
||
|
/* Element lacks required attribute */
|
||
|
dom_string_unref(attr);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/* Finished with the attr dom_string */
|
||
|
dom_string_unref(attr);
|
||
|
|
||
|
/* Get attribute value's string data */
|
||
|
string = dom_string_data(attr_value);
|
||
|
length = dom_string_byte_length(attr_value);
|
||
|
|
||
|
/* Print attribute info */
|
||
|
printf(" %s=\"%.*s\"", attribute, (int)length, string);
|
||
|
|
||
|
/* Finished with the attr_value dom_string */
|
||
|
dom_string_unref(attr_value);
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Print a line in a DOM structure dump for an element
|
||
|
*
|
||
|
* \param node The node to dump
|
||
|
* \param depth The node's depth
|
||
|
* \return true on success, or false on error
|
||
|
*/
|
||
|
bool dump_dom_element(dom_node *node, int depth)
|
||
|
{
|
||
|
dom_exception exc;
|
||
|
dom_string *node_name = NULL;
|
||
|
dom_node_type type;
|
||
|
int i;
|
||
|
const char *string;
|
||
|
size_t length;
|
||
|
|
||
|
/* Only interested in element nodes */
|
||
|
exc = dom_node_get_node_type(node, &type);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf("Exception raised for node_get_node_type\n");
|
||
|
return false;
|
||
|
} else if (type != DOM_ELEMENT_NODE) {
|
||
|
/* Nothing to print */
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/* Get element name */
|
||
|
exc = dom_node_get_node_name(node, &node_name);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf("Exception raised for get_node_name\n");
|
||
|
return false;
|
||
|
} else if (node_name == NULL) {
|
||
|
printf("Broken: root_name == NULL\n");
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/* Print ASCII tree structure for current node */
|
||
|
if (depth > 0) {
|
||
|
for (i = 0; i < depth; i++) {
|
||
|
printf("| ");
|
||
|
}
|
||
|
printf("+-");
|
||
|
}
|
||
|
|
||
|
/* Get string data and print element name */
|
||
|
string = dom_string_data(node_name);
|
||
|
length = dom_string_byte_length(node_name);
|
||
|
printf("[%.*s]", (int)length, string);
|
||
|
|
||
|
if (length == 5 && strncmp(string, "title", 5) == 0) {
|
||
|
/* Title tag, gather the title */
|
||
|
dom_string *str;
|
||
|
exc = dom_node_get_text_content(node, &str);
|
||
|
if (exc == DOM_NO_ERR && str != NULL) {
|
||
|
printf(" $%.*s$", (int)dom_string_byte_length(str),
|
||
|
dom_string_data(str));
|
||
|
dom_string_unref(str);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Finished with the node_name dom_string */
|
||
|
dom_string_unref(node_name);
|
||
|
|
||
|
/* Print the element's id & class, if it has them */
|
||
|
if (dump_dom_element_attribute(node, "id") == false ||
|
||
|
dump_dom_element_attribute(node, "class") == false) {
|
||
|
/* Error occured */
|
||
|
printf("\n");
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
printf("\n");
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Walk though a DOM (sub)tree, in depth first order, printing DOM structure.
|
||
|
*
|
||
|
* \param node The root node to start from
|
||
|
* \param depth The depth of 'node' in the (sub)tree
|
||
|
*/
|
||
|
bool dump_dom_structure(dom_node *node, int depth)
|
||
|
{
|
||
|
dom_exception exc;
|
||
|
dom_node *child;
|
||
|
|
||
|
/* Print this node's entry */
|
||
|
if (dump_dom_element(node, depth) == false) {
|
||
|
/* There was an error; return */
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/* Get the node's first child */
|
||
|
exc = dom_node_get_first_child(node, &child);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf("Exception raised for node_get_first_child\n");
|
||
|
return false;
|
||
|
} else if (child != NULL) {
|
||
|
/* node has children; decend to children's depth */
|
||
|
depth++;
|
||
|
|
||
|
/* Loop though all node's children */
|
||
|
do {
|
||
|
dom_node *next_child;
|
||
|
|
||
|
/* Visit node's descendents */
|
||
|
if (dump_dom_structure(child, depth) == false) {
|
||
|
/* There was an error; return */
|
||
|
dom_node_unref(child);
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/* Go to next sibling */
|
||
|
exc = dom_node_get_next_sibling(child, &next_child);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf("Exception raised for "
|
||
|
"node_get_next_sibling\n");
|
||
|
dom_node_unref(child);
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
dom_node_unref(child);
|
||
|
child = next_child;
|
||
|
} while (child != NULL); /* No more children */
|
||
|
}
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Main entry point from OS.
|
||
|
*/
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
dom_exception exc; /* returned by libdom functions */
|
||
|
dom_document *doc = NULL; /* document, loaded into libdom */
|
||
|
dom_node *root = NULL; /* root element of document */
|
||
|
|
||
|
/* Load up the input HTML file */
|
||
|
doc = create_doc_dom_from_file((argc > 1) ? (argv[1]) : "files/test.html");
|
||
|
if (doc == NULL) {
|
||
|
printf("Failed to load document.\n");
|
||
|
return EXIT_FAILURE;
|
||
|
}
|
||
|
|
||
|
/* Get root element */
|
||
|
exc = dom_document_get_document_element(doc, &root);
|
||
|
if (exc != DOM_NO_ERR) {
|
||
|
printf("Exception raised for get_document_element\n");
|
||
|
dom_node_unref(doc);
|
||
|
return EXIT_FAILURE;
|
||
|
} else if (root == NULL) {
|
||
|
printf("Broken: root == NULL\n");
|
||
|
dom_node_unref(doc);
|
||
|
return EXIT_FAILURE;
|
||
|
}
|
||
|
|
||
|
/* Dump DOM structure */
|
||
|
if (dump_dom_structure(root, 0) == false) {
|
||
|
printf("Failed to complete DOM structure dump.\n");
|
||
|
dom_node_unref(root);
|
||
|
dom_node_unref(doc);
|
||
|
return EXIT_FAILURE;
|
||
|
}
|
||
|
|
||
|
dom_node_unref(root);
|
||
|
|
||
|
/* Finished with the dom_document */
|
||
|
dom_node_unref(doc);
|
||
|
|
||
|
return EXIT_SUCCESS;
|
||
|
}
|
||
|
|