/*
 * hpricot_scan.rl
 *
 * $Author: why $
 * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
 *
 * Copyright (C) 2006, 2010 why the lucky stiff
 */

#include <ruby.h>
#include <assert.h>
#include <string.h>

/*
 * Backing store behind every Hpricot node object: an array of `len`
 * Ruby VALUE slots that the H_ELE_GET / H_ELE_SET macros index into.
 */
struct hpricot_struct {
  int    len;  /* number of slots in ptr */
  VALUE *ptr;  /* slot array; hstruct_mark tolerates it being NULL */
};

/*
 * Fallback accessors for when the Ruby headers do not provide RARRAY_LEN
 * (and, with it, RSTRING_LEN / RSTRING_PTR): read the fields directly.
 */
#ifndef RARRAY_LEN
#define RARRAY_LEN(arr)  RARRAY(arr)->len
#define RSTRING_LEN(str) RSTRING(str)->len
#define RSTRING_PTR(str) RSTRING(str)->ptr
#endif

/* Not defined in this part of the file; registered as Hpricot.css by
 * Init_hpricot_scan. */
VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);

/* Trailer appended (by string-literal concatenation) to the parse-error
 * messages raised from hpricot_scan. */
#define NO_WAY_SERIOUSLY "*** This should not happen, please file a bug report with the HTML you're parsing at github.com/hpricot/hpricot/issues. So sorry!"

/* Token-type symbols handed to rb_hpricot_token / rb_yield_tokens. */
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag,
             sym_emptytag, sym_comment, sym_cdata, sym_text;

/* Member-name symbols used to build the node struct classes. */
static VALUE sym_name, sym_parent, sym_raw_attributes, sym_raw_string,
             sym_tagno, sym_allowed, sym_children;

/* Element-content markers compared against H_ELE_EC values. */
static VALUE sym_EMPTY, sym_CDATA;

/* The Hpricot module and its parse-error class. */
static VALUE mHpricot, rb_eHpricotParseError;

/* Node classes created in Init_hpricot_scan. */
static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
             cXMLDecl, cProcIns;

/* :allow / :deny values looked up in element-content hashes. */
static VALUE symAllow, symDeny;

/* Interned method and constant names. */
static ID s_ElementContent;
static ID s_downcase, s_new, s_parent, s_read, s_to_str;

/* Regexp that splits a processing instruction into target and content. */
static VALUE reProcInsParse;

/* Slot indexes into hpricot_struct.ptr for element-like nodes. */
#define H_ELE_TAG      0
#define H_ELE_PARENT   1
#define H_ELE_ATTR     2
#define H_ELE_ETAG     3
#define H_ELE_RAW      4
#define H_ELE_EC       5
#define H_ELE_HASH     6
#define H_ELE_CHILDREN 7

/* The VALUE slot array stored behind a node object. */
#define HSTRUCT_PTR(ele) ((struct hpricot_struct*)DATA_PTR(ele))->ptr

/*
 * Read / write slot `idx` of a node.  The subscript is essential: without
 * it H_ELE_SET would overwrite the slot-array pointer itself.
 * No bounds check is performed against hpricot_struct.len.
 */
#define H_ELE_GET(ele, idx)      HSTRUCT_PTR(ele)[idx]
#define H_ELE_SET(ele, idx, val) HSTRUCT_PTR(ele)[idx] = val

/* True when `opts` is non-nil and its :key entry is truthy. */
#define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))

/*
 * Encoding support.  When <ruby/encoding.h> is available, new strings are
 * tagged with the `encoding_index` variable of the enclosing function and
 * that index is passed on to rb_hpricot_token; otherwise both macros
 * expand to nothing.
 */
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
# define ASSOCIATE_INDEX(s)  rb_enc_associate_index((s), encoding_index)
# define ENCODING_INDEX      , encoding_index
#else
# define ASSOCIATE_INDEX(s)
# define ENCODING_INDEX
#endif

/*
 * ELE(N): emit the token of type sym_<N> that the scanner just completed.
 *
 * Relies on locals of the enclosing scanner function: ts, te, text,
 * ele_open, tag, attr, taint, S (and encoding_index when encodings are
 * enabled).  The raw source slice [ts, te) is kept only for token kinds
 * other than cdata, text, procins and comment.  With a block the token is
 * yielded; without one it is handed to rb_hpricot_token.
 *
 * NOTE(review): in the block branch raw_string is built but Qnil is what
 * gets passed to rb_yield_tokens -- confirm whether raw_string was meant
 * to be yielded.
 */
#define ELE(N) \
  if (te > ts || text == 1) {                                           \
    char *raw = NULL;                                                   \
    int rawlen = 0;                                                     \
    ele_open = 0; text = 0;                                             \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts;                                       \
    }                                                                   \
    if (rb_block_given_p()) {                                           \
      VALUE raw_string = Qnil;                                          \
      if (raw != NULL) {                                                \
        raw_string = rb_str_new(raw, rawlen);                           \
        ASSOCIATE_INDEX(raw_string);                                    \
      }                                                                 \
      rb_yield_tokens(sym_##N, tag, attr, Qnil, taint);                 \
    } else                                                              \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint ENCODING_INDEX); \
  }

/*
 * SET(N, E): assign to the VALUE `N` a new Ruby string copied from
 * [mark_N, E).  A NULL mark or an empty range yields "".  When E lies
 * before the mark, N is left untouched.
 */
#define SET(N, E) \
  if (mark_##N == NULL || E == mark_##N) {      \
    N = rb_str_new2("");                        \
    ASSOCIATE_INDEX(N);                         \
  } else if (E > mark_##N) {                    \
    N = rb_str_new(mark_##N, E - mark_##N);     \
    ASSOCIATE_INDEX(N);                         \
  }

/* CAT(N, E): append [mark_N, E) to string N, creating N via SET if nil. */
#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }

/* SLIDE(N): rebase mark_N after the token at `ts` has been moved to the
 * start of `buf`; marks at or before ts (including NULL) are left alone. */
#define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);

/*
 * ATTR(K, V): store V under key K in the `attr` hash of the enclosing
 * scope, creating the hash on first use.  Does nothing when K is nil.
 */
#define ATTR(K, V) \
  if (!NIL_P(K)) { \
    if (NIL_P(attr)) attr = rb_hash_new(); \
    rb_hash_aset(attr, K, V); \
  }

/*
 * TEXT_PASS(): switch the scanner into text-accumulation mode if it is not
 * already in it.  If an element was open, the text restarts at the token
 * start `ts` (when set); otherwise it starts at the current position `p`.
 * Pending tag/attr values are dropped.
 */
#define TEXT_PASS() \
  if (text == 0) \
  { \
    if (ele_open == 1) { \
      ele_open = 0; \
      if (ts > 0) { \
        mark_tag = ts; \
        } \
    } else { \
      mark_tag = p; \
    } \
    attr = Qnil; \
    tag = Qnil; \
    text = 1; \
  }

/* EBLK(N, T): append up to (p - T + 1) to `tag`, then emit an N token. */
#define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);

%%{

  machine hpricot_scan;

  # Flush any pending text token, then reset per-element state.
  action newEle {
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    attr = Qnil;
    tag = Qnil;
    mark_tag = NULL;
    ele_open = 1;
  }

  # Mark actions record where a tag name / attribute key / value begins.
  action _tag { mark_tag = p; }
  action _aval { mark_aval = p; }
  action _akey { mark_akey = p; }

  # Capture actions copy the marked range into a Ruby string.
  action tag { SET(tag, p); }
  action tagc { SET(tag, p-1); }
  action aval { SET(aval, p); }
  action aunq {
    /* drop a trailing quote character from the captured value */
    if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
    else { SET(aval, p); }
  }
  action akey { SET(akey, p); }
  action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
  action xmlenc {
#ifdef HAVE_RUBY_ENCODING_H
    if (mark_aval < p) {
      /* rb_enc_find_index wants a C string: NUL-terminate the encoding
         name in place for the call, then restore the overwritten byte. */
      char psave = *p;
      *p = '\0';
      encoding_index = rb_enc_find_index(mark_aval);
      *p = psave;
    }
#endif
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("encoding")), aval);
  }
  action xmlsd  { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
  action pubid  { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
  action sysid  { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }

  action new_attr {
    akey = Qnil;
    aval = Qnil;
    mark_akey = NULL;
    mark_aval = NULL;
  }

  action save_attr {
    /* attribute names are downcased unless parsing as XML */
    if (!S->xml && !NIL_P(akey))
      akey = rb_funcall(akey, s_downcase, 0);
    ATTR(akey, aval);
  }

  include hpricot_common "hpricot_common.rl";

}%%

%% write data nofinal;

/* Default read-buffer size in bytes, and the step by which hpricot_scan
 * grows the buffer when a single token fills it. */
#define BUFSIZE 16384

/*
 * Yield one scanner token to the caller's block as the 4-element array
 * [sym, tag, attr, raw].  For text tokens the raw entry is the tag string
 * itself.  When `taint` is non-zero the array and its three payload
 * values are marked tainted before yielding.
 */
void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
{
  VALUE token;

  raw = (sym == sym_text) ? tag : raw;
  token = rb_ary_new3(4, sym, tag, attr, raw);

  if (taint != 0) {
    OBJ_TAINT(token);
    OBJ_TAINT(tag);
    OBJ_TAINT(attr);
    OBJ_TAINT(raw);
  }

  rb_yield(token);
}

ifndef RHASH_TBL

rb_hash_lookup() is only in Ruby 1.8.7

static VALUE our_rb_hash_lookup(VALUE hash, VALUE key) {

VALUE val;

if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
  return Qnil; /* without Hash#default */
}

return val;

} define rb_hash_lookup our_rb_hash_lookup endif

/*
 * Append `ele` to the children of `focus` (creating the children array on
 * first use) and record `focus` as the parent of `ele`.
 */
static void rb_hpricot_add(VALUE focus, VALUE ele)
{
  VALUE kids = H_ELE_GET(focus, H_ELE_CHILDREN);

  if (NIL_P(kids)) {
    kids = rb_ary_new2(1);
    H_ELE_SET(focus, H_ELE_CHILDREN, kids);
  }

  rb_ary_push(kids, ele);
  H_ELE_SET(ele, H_ELE_PARENT, focus);
}

/* Tree-building state carried through one hpricot_scan call when no block
 * is given. */
typedef struct {
  VALUE doc;             /* the document node being built */
  VALUE focus;           /* node that currently receives new children */
  VALUE last;            /* most recently created node, or Qnil */
  VALUE EC;              /* value of the Hpricot::ElementContent constant */
  unsigned char xml;     /* :xml option */
  unsigned char strict;  /* :xhtml_strict option */
  unsigned char fixup;   /* :fixup_tags option; forced on when strict */
} hpricot_state;

/*
 * H_PROP(prop, idx): define three Ruby method bodies backed by slot `idx`:
 *   hpricot_ele_set_<prop>(self, x)  stores x and returns self
 *   hpricot_ele_clear_<prop>(self)   stores nil and returns true
 *   hpricot_ele_get_<prop>(self)     returns the slot value
 */
#define H_PROP(prop, idx) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    H_ELE_SET(self, idx, x); \
    return self; \
  } \
  static VALUE hpricot_ele_clear_##prop(VALUE self) { \
    H_ELE_SET(self, idx, Qnil); \
    return Qtrue; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    return H_ELE_GET(self, idx); \
  }

/*
 * H_ATTR(prop): define setter/getter method bodies that store and fetch
 * the :<prop> entry of the hash held in the node's H_ELE_ATTR slot.
 * Neither checks that the slot actually holds a hash.
 */
#define H_ATTR(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
  }

/* Slot-backed accessors (set / clear / get). */
H_PROP(name, H_ELE_TAG);
H_PROP(raw, H_ELE_RAW);
H_PROP(parent, H_ELE_PARENT);
H_PROP(attr, H_ELE_ATTR);
H_PROP(etag, H_ELE_ETAG);
H_PROP(children, H_ELE_CHILDREN);

/* Accessors for entries of the attribute hash (set / get). */
H_ATTR(target);
H_ATTR(encoding);
H_ATTR(version);
H_ATTR(standalone);
H_ATTR(system_id);
H_ATTR(public_id);

/*
 * H_ELE(klass): allocate a node of class `klass` into the local `ele`,
 * fill its slots from the locals tag / attr / ec / raw / rawlen / sym of
 * the enclosing function, and record it in S->last.
 *
 *   cElem       tag, attr and ec are stored; the raw source text is kept
 *               for empty tags, start tags and doctypes.
 *   cBogusETag  tag is stored; the raw text goes into the ATTR slot.
 *   cDocType, cProcIns, cXMLDecl
 *               attr is stored (a doctype first gets a "target" entry);
 *               except for cProcIns the TAG slot receives the raw text
 *               (or nil) instead of the tag name.
 *   otherwise   only tag is stored.
 *
 * Note that `tag` and `attr` in the enclosing scope may be reassigned.
 */
#define H_ELE(klass) \
  ele = rb_obj_alloc(klass);                                            \
  if (klass == cElem) {                                                 \
    H_ELE_SET(ele, H_ELE_TAG, tag);                                     \
    H_ELE_SET(ele, H_ELE_ATTR, attr);                                   \
    H_ELE_SET(ele, H_ELE_EC, ec);                                       \
    if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
      VALUE raw_str = rb_str_new(raw, rawlen);                          \
      ASSOCIATE_INDEX(raw_str);                                         \
      H_ELE_SET(ele, H_ELE_RAW, raw_str);                               \
    }                                                                   \
  } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
    if (klass == cBogusETag) {                                          \
      H_ELE_SET(ele, H_ELE_TAG, tag);                                   \
      if (raw != NULL) {                                                \
        VALUE raw_str = rb_str_new(raw, rawlen);                        \
        ASSOCIATE_INDEX(raw_str);                                       \
        H_ELE_SET(ele, H_ELE_ATTR, raw_str);                            \
      }                                                                 \
    } else {                                                            \
      if (klass == cDocType)                                            \
        ATTR(ID2SYM(rb_intern("target")), tag);                         \
      H_ELE_SET(ele, H_ELE_ATTR, attr);                                 \
      if (klass != cProcIns) {                                          \
        tag = Qnil;                                                     \
        if (raw != NULL) {                                              \
          tag = rb_str_new(raw, rawlen);                                \
          ASSOCIATE_INDEX(tag);                                         \
        }                                                               \
      }                                                                 \
      H_ELE_SET(ele, H_ELE_TAG, tag);                                   \
    }                                                                   \
  } else {                                                              \
    H_ELE_SET(ele, H_ELE_TAG, tag);                                     \
  }                                                                     \
  S->last = ele

// // the swift, compact parser logic. most of the complicated stuff is done // in the lexer. this step just pairs up the start and end tags. // void rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr,

char *raw, int rawlen, int taint

ifdef HAVE_RUBY_ENCODING_H

, int encoding_index

endif ) {

VALUE ele, ec = Qnil;

//
// in html mode, fix up start tags incorrectly formed as empty tags
//
if (!S->xml) {
  if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
    ec = rb_hash_aref(S->EC, tag);
    if (NIL_P(ec)) {
      tag = rb_funcall(tag, s_downcase, 0);
      ec = rb_hash_aref(S->EC, tag);
    }
  }

  if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
     (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
    !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
  {
    sym = sym_text;
    tag = rb_str_new(raw, rawlen);
    ASSOCIATE_INDEX(tag);
  }

  if (!NIL_P(ec)) {
    if (sym == sym_emptytag) {
      if (ec != sym_EMPTY)
        sym = sym_stag;
    } else if (sym == sym_stag) {
      if (ec == sym_EMPTY)
        sym = sym_emptytag;
    }
  }
}

if (sym == sym_emptytag || sym == sym_stag) {
  VALUE name = INT2FIX(rb_str_hash(tag));
  H_ELE(cElem);
  H_ELE_SET(ele, H_ELE_HASH, name);

  if (!S->xml) {
    VALUE match = Qnil, e = S->focus;
    while (e != S->doc)
    {
      if (ec == Qnil) {
        // anything can contain unknown elements
        if (match == Qnil)
          match = e;
      } else {
        VALUE hEC = H_ELE_GET(e, H_ELE_EC);

        if (TYPE(hEC) == T_HASH)
        {
          VALUE has = rb_hash_lookup(hEC, name);
          if (has != Qnil) {
            if (has == Qtrue) {
              if (match == Qnil)
                match = e;
            } else if (has == symAllow) {
              match = S->focus;
            } else if (has == symDeny) {
              match = Qnil;
            }
          }
        } else {
          // Unknown elements can contain anything
          if (match == Qnil)
            match = e;
        }
      }
      e = H_ELE_GET(e, H_ELE_PARENT);
    }

    if (match == Qnil)
      match = S->focus;
    S->focus = match;
  }

  rb_hpricot_add(S->focus, ele);

  //
  // in the case of a start tag that should be empty, just
  // skip the step that focuses the element.  focusing moves
  // us deeper into the document.
  //
  if (sym == sym_stag) {
    if (S->xml || ec != sym_EMPTY) {
      S->focus = ele;
      S->last = Qnil;
    }
  }
} else if (sym == sym_etag) {
  VALUE name, match = Qnil, e = S->focus;
  if (S->strict) {
    if (NIL_P(rb_hash_aref(S->EC, tag))) {
      tag = rb_str_new2("div");
      ASSOCIATE_INDEX(tag);
    }
  }

  //
  // another optimization will be to improve this very simple
  // O(n) tag search, where n is the depth of the focused tag.
  //
  // (see also: the search above for fixups)
  //
  name = INT2FIX(rb_str_hash(tag));
  while (e != S->doc)
  {
    if (H_ELE_GET(e, H_ELE_HASH) == name)
    {
      match = e;
      break;
    }

    e = H_ELE_GET(e, H_ELE_PARENT);
  }

  if (NIL_P(match))
  {
    H_ELE(cBogusETag);
    rb_hpricot_add(S->focus, ele);
  }
  else
  {
    VALUE ele = Qnil;
    if (raw != NULL) {
      ele = rb_str_new(raw, rawlen);
      ASSOCIATE_INDEX(ele);
    }
    H_ELE_SET(match, H_ELE_ETAG, ele);
    S->focus = H_ELE_GET(match, H_ELE_PARENT);
    S->last = Qnil;
  }
} else if (sym == sym_cdata) {
  H_ELE(cCData);
  rb_hpricot_add(S->focus, ele);
} else if (sym == sym_comment) {
  H_ELE(cComment);
  rb_hpricot_add(S->focus, ele);
} else if (sym == sym_doctype) {
  H_ELE(cDocType);
  if (S->strict) {
    VALUE id;
    id = rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
    ASSOCIATE_INDEX(id);
    rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), id);
    id = rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN");
    ASSOCIATE_INDEX(id);
    rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), id);
  }
  rb_hpricot_add(S->focus, ele);
} else if (sym == sym_procins) {
  VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
  tag = rb_reg_nth_match(1, match);
  attr = rb_reg_nth_match(2, match);
  {
    H_ELE(cProcIns);
    rb_hpricot_add(S->focus, ele);
  }
} else if (sym == sym_text) {
  // TODO: add raw_string as well?
  if (!NIL_P(S->last) && RTEST(rb_obj_is_instance_of(S->last, cText))) {
    rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
  } else {
    H_ELE(cText);
    rb_hpricot_add(S->focus, ele);
  }
} else if (sym == sym_xmldecl) {
  H_ELE(cXMLDecl);
  rb_hpricot_add(S->focus, ele);
}

}

VALUE hpricot_scan(int argc, VALUE *argv, VALUE self) {

int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;

hpricot_state *S = NULL;
VALUE port, opts;
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
int done = 0, ele_open = 0, buffer_size = 0, taint = 0;

ifdef HAVE_RUBY_ENCODING_H

int encoding_index = rb_enc_to_index(rb_default_external_encoding());

endif

rb_scan_args(argc, argv, "11", &port, &opts);
taint = OBJ_TAINTED(port);
io = rb_respond_to(port, s_read);
if (!io)
{
  if (rb_respond_to(port, s_to_str))
  {
    port = rb_funcall(port, s_to_str, 0);
    StringValue(port);
  }
  else
  {
    rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
  }
}

if (TYPE(opts) != T_HASH)
  opts = Qnil;

if (!rb_block_given_p())
{
  S = ALLOC(hpricot_state);
  S->doc = rb_obj_alloc(cDoc);
  rb_gc_register_address(&S->doc);
  S->focus = S->doc;
  S->last = Qnil;
  S->xml = OPT(opts, xml);
  S->strict = OPT(opts, xhtml_strict);
  S->fixup = OPT(opts, fixup_tags);
  if (S->strict) S->fixup = 1;
  rb_ivar_set(S->doc, rb_intern("@options"), opts);

  S->EC = rb_const_get(mHpricot, s_ElementContent);
}

buffer_size = BUFSIZE;
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
  bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
  if (!NIL_P(bufsize)) {
    buffer_size = NUM2INT(bufsize);
  }
}

if (io)
  buf = ALLOC_N(char, buffer_size);

%% write init;

while (!done) {
  VALUE str;
  char *p, *pe;
  int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;

  if (io)
  {
    if (space == 0) {
      /* We've used up the entire buffer storing an already-parsed token
         prefix that must be preserved.  Likely caused by super-long attributes.
         Increase buffer size and continue  */
       tokstart_diff = ts - buf;
       tokend_diff = te - buf;
       mark_tag_diff = mark_tag - buf;
       mark_akey_diff = mark_akey - buf;
       mark_aval_diff = mark_aval - buf;

       buffer_size += BUFSIZE;
       REALLOC_N(buf, char, buffer_size);

       space = buffer_size - have;

       ts = buf + tokstart_diff;
       te = buf + tokend_diff;
       mark_tag = buf + mark_tag_diff;
       mark_akey = buf + mark_akey_diff;
       mark_aval = buf + mark_aval_diff;
    }
    p = buf + have;

    str = rb_funcall(port, s_read, 1, INT2FIX(space));
    len = RSTRING_LEN(str);
    memcpy(p, StringValuePtr(str), len);
  }
  else
  {
    p = RSTRING_PTR(port);
    len = RSTRING_LEN(port) + 1;
    done = 1;
  }

  nread += len;

  /* If this is the last buffer, tack on an EOF. */
  if (io && len < space) {
    p[len++] = 0;
    done = 1;
  }

  pe = p + len;
  %% write exec;

  if (cs == hpricot_scan_error) {
    if (buf != NULL)
      free(buf);
    if (!NIL_P(tag))
    {
      rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
    }
    else
    {
      rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
    }
  }

  if (done && ele_open)
  {
    ele_open = 0;
    if (ts > 0) {
      mark_tag = ts;
      ts = 0;
      text = 1;
    }
  }

  if (ts == 0)
  {
    have = 0;
    /* text nodes have no ts because each byte is parsed alone */
    if (mark_tag != NULL && text == 1)
    {
      if (done)
      {
        if (mark_tag < p-1)
        {
          CAT(tag, p-1);
          ELE(text);
        }
      }
      else
      {
        CAT(tag, p);
      }
    }
    if (io)
      mark_tag = buf;
    else
      mark_tag = RSTRING_PTR(port);
  }
  else if (io)
  {
    have = pe - ts;
    memmove(buf, ts, have);
    SLIDE(tag);
    SLIDE(akey);
    SLIDE(aval);
    te = buf + (te - ts);
    ts = buf;
  }
}

if (buf != NULL)
  free(buf);

if (S != NULL)
{
  VALUE doc = S->doc;
  rb_gc_unregister_address(&S->doc);
  free(S);
  return doc;
}

return Qnil;

}

void hstruct_mark(void* ptr) {

struct hpricot_struct* st = (struct hpricot_struct*)ptr;
int i;

/* it's likely to hit GC when allocating st->ptr.
   that should be checked to avoid segfault.
   and simply ignore it.
  /
if (st->ptr) {
  for(i = 0; i < st->len; i++) {
    rb_gc_mark(st->ptr[i]);
  }
}

}

void hstruct_free(void* ptr) {

struct hpricot_struct* st = (struct hpricot_struct*)ptr;

free(st->ptr);
free(st);

}

/*
 * Shared allocator: wrap a fresh hpricot_struct with `n` slots, all set
 * to nil, in an object of class `klass`.
 *
 * The slot array is allocated and cleared before it is published in the
 * struct, so a GC run triggered by the allocation sees a NULL ptr (which
 * hstruct_mark skips) rather than uninitialised slots.
 */
static VALUE alloc_hpricot_struct(VALUE klass, int n)
{
  VALUE obj;
  VALUE *slots;
  struct hpricot_struct* st;

  obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);

  slots = ALLOC_N(VALUE, n);
  rb_mem_clear(slots, n);

  st->ptr = slots;
  st->len = n;

  return obj;
}

/* Allocator for 8-slot nodes. */
static VALUE alloc_hpricot_struct8(VALUE klass)
{
  return alloc_hpricot_struct(klass, 8);
}

/* Allocator for 2-slot nodes. */
static VALUE alloc_hpricot_struct2(VALUE klass)
{
  return alloc_hpricot_struct(klass, 2);
}

/* Allocator for 3-slot nodes. */
static VALUE alloc_hpricot_struct3(VALUE klass)
{
  return alloc_hpricot_struct(klass, 3);
}

static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);} static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);} static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);} static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);} static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);} static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);} static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);} static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);} static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);} static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}

static VALUE (*ref_func)() = {

hpricot_struct_ref0,
hpricot_struct_ref1,
hpricot_struct_ref2,
hpricot_struct_ref3,
hpricot_struct_ref4,
hpricot_struct_ref5,
hpricot_struct_ref6,
hpricot_struct_ref7,
hpricot_struct_ref8,
hpricot_struct_ref9,

};

static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);} static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);} static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);} static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);} static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);} static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);} static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);} static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);} static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);} static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}

static VALUE (*set_func)() = {

hpricot_struct_set0,
hpricot_struct_set1,
hpricot_struct_set2,
hpricot_struct_set3,
hpricot_struct_set4,
hpricot_struct_set5,
hpricot_struct_set6,
hpricot_struct_set7,
hpricot_struct_set8,
hpricot_struct_set9,

};

/*
 * Build an anonymous class whose instances are created by `alloc` and
 * which has a reader `name` and a writer `name=` for every symbol in
 * `members`; member i is bound to slot i.
 *
 * members  Ruby array of symbols (fewer than 10 entries)
 * alloc    allocator function installed on the new class
 *
 * Raises ArgumentError if a member name is too long for the buffer used to
 * build the writer's name.
 */
static VALUE make_hpricot_struct(VALUE members, VALUE (*alloc)(VALUE klass))
{
  int i = 0;
  int len;
  char attr_set[128];

  VALUE klass = rb_class_new(rb_cObject);
  rb_define_alloc_func(klass, alloc);

  len = RARRAY_LEN(members);
  assert(len < 10);

  for (i = 0; i < len; i++) {
    ID id = SYM2ID(rb_ary_entry(members, i));
    const char* name = rb_id2name(id);
    size_t name_len = strlen(name);

    /* room is needed for the name, the '=' and the terminating NUL */
    if (name_len + 2 > sizeof(attr_set))
      rb_raise(rb_eArgError, "struct member name too long: %s", name);

    memcpy(attr_set, name, name_len);
    attr_set[name_len] = '=';
    attr_set[name_len + 1] = '\0';

    rb_define_method(klass, name, ref_func[i], 0);
    rb_define_method(klass, attr_set, set_func[i], 1);
  }
  return klass;
}

/*
 * Extension entry point: interns the IDs and symbols used by the scanner,
 * defines the Hpricot module with its `scan` and `css` singleton methods,
 * and creates the node classes on top of three generated struct classes.
 */
void Init_hpricot_scan()
{
  VALUE structElem, structAttr, structBasic;
  VALUE elemMembers, attrMembers, basicMembers;

  /* --- interned names ------------------------------------------------ */
  s_ElementContent = rb_intern("ElementContent");
  symAllow = ID2SYM(rb_intern("allow"));
  symDeny = ID2SYM(rb_intern("deny"));
  s_downcase = rb_intern("downcase");
  s_new = rb_intern("new");
  s_parent = rb_intern("parent");
  s_read = rb_intern("read");
  s_to_str = rb_intern("to_str");
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
  sym_doctype = ID2SYM(rb_intern("doctype"));
  sym_procins = ID2SYM(rb_intern("procins"));
  sym_stag = ID2SYM(rb_intern("stag"));
  sym_etag = ID2SYM(rb_intern("etag"));
  sym_emptytag = ID2SYM(rb_intern("emptytag"));
  sym_allowed = ID2SYM(rb_intern("allowed"));
  sym_children = ID2SYM(rb_intern("children"));
  sym_comment = ID2SYM(rb_intern("comment"));
  sym_cdata = ID2SYM(rb_intern("cdata"));
  sym_name = ID2SYM(rb_intern("name"));
  sym_parent = ID2SYM(rb_intern("parent"));
  sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
  sym_raw_string = ID2SYM(rb_intern("raw_string"));
  sym_tagno = ID2SYM(rb_intern("tagno"));
  sym_text = ID2SYM(rb_intern("text"));
  sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
  sym_CDATA = ID2SYM(rb_intern("CDATA"));

  /* --- module -------------------------------------------------------- */
  mHpricot = rb_define_module("Hpricot");
  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
  rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
  /* NOTE(review): hpricot_css is prototyped in this file with five VALUE
     parameters but registered here with argc 3 -- confirm against its
     definition. */
  rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
  rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);

  /* --- generated base classes: 8-, 3- and 2-slot nodes --------------- */
  elemMembers = rb_ary_new3(8, sym_name, sym_parent,
    sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
    sym_tagno, sym_children);
  structElem = make_hpricot_struct(elemMembers, alloc_hpricot_struct8);

  attrMembers = rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes);
  structAttr = make_hpricot_struct(attrMembers, alloc_hpricot_struct3);

  basicMembers = rb_ary_new3(2, sym_name, sym_parent);
  structBasic = make_hpricot_struct(basicMembers, alloc_hpricot_struct2);

  /* --- node classes -------------------------------------------------- */
  cDoc = rb_define_class_under(mHpricot, "Doc", structElem);

  cCData = rb_define_class_under(mHpricot, "CData", structBasic);
  rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
  rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);

  cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
  rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
  rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);

  cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
  rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
  rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
  rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
  rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
  rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
  rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
  rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
  rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);

  cElem = rb_define_class_under(mHpricot, "Elem", structElem);
  rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);

  /* a bogus end tag keeps its raw text in the attribute slot */
  cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
  rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
  rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);

  cText = rb_define_class_under(mHpricot, "Text", structBasic);
  rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
  rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
  rb_define_method(cText, "content", hpricot_ele_get_name, 0);
  rb_define_method(cText, "content=", hpricot_ele_set_name, 1);

  cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
  rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
  rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
  rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
  rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
  rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
  rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
  rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
  rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);

  /* a processing instruction keeps its target in the name slot and its
     content in the attribute slot */
  cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
  rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
  rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
  rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
  rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);

  /* --- regexp used to split processing instructions ------------------ */
  reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m");
  rb_const_set(mHpricot, rb_intern("ProcInsParse"), reProcInsParse);
}