Logo Search packages:      
Sourcecode: afnix version File versions

Regex.cpp

// ---------------------------------------------------------------------------
// - Regex.cpp                                                               -
// - standard object library - regex class implementation                    -
// ---------------------------------------------------------------------------
// - This program is free software;  you can redistribute it  and/or  modify -
// - it provided that this copyright notice is kept intact.                  -
// -                                                                         -
// - This program  is  distributed in  the hope  that it will be useful, but -
// - without  any  warranty;  without  even   the   implied    warranty   of -
// - merchantability or fitness for a particular purpose.  In no event shall -
// - the copyright holder be liable for any  direct, indirect, incidental or -
// - special damages arising in any way out of the use of this software.     -
// ---------------------------------------------------------------------------
// - copyright (c) 1999-2007 amaury darsch                                   -
// ---------------------------------------------------------------------------

#include "Real.hpp"
#include "Regex.hpp"
#include "Stdsid.hxx"
#include "Vector.hpp"
#include "Boolean.hpp"
#include "Unicode.hpp"
#include "Runnable.hpp"
#include "Unitabler.hpp"
#include "QuarkZone.hpp"
#include "Exception.hpp"
#include "InputString.hpp"
#include "OutputTerm.hpp"

namespace afnix {

  // -------------------------------------------------------------------------
  // - private section                                                       -
  // -------------------------------------------------------------------------

  // tell whether a character is a blank, that is whether it is equal
  // to blkq or tabq
  static inline bool re_check_blank (const t_quad c) {
    return (c == blkq) || (c == tabq);
  }

  // tell whether a character ends a line or the input, that is whether
  // it is equal to eolq, eofq or crlq (meta characters e/E)
  static inline bool re_check_eos (const t_quad c) {
    return (c == eolq) || (c == eofq) || (c == crlq);
  }

  // tell whether a character is a line terminator, that is whether it
  // is equal to eolq or crlq (meta characters n/N)
  static inline bool re_check_newln (const t_quad c) {
    return (c == eolq) || (c == crlq);
  }

  // tell whether Unicode::isalpha accepts a character (meta characters a/A)
  static inline bool re_check_alpha (const t_quad c) {
    const bool status = Unicode::isalpha (c);
    return status;
  }

  // tell whether Unicode::isdigit accepts a character (meta characters d/D)
  static inline bool re_check_digit (const t_quad c) {
    const bool status = Unicode::isdigit (c);
    return status;
  }

  // tell whether Unicode::islower accepts a character (meta characters l/L)
  static inline bool re_check_lower (const t_quad c) {
    const bool status = Unicode::islower (c);
    return status;
  }

  // tell whether Unicode::isupper accepts a character (meta characters u/U)
  static inline bool re_check_upper (const t_quad c) {
    const bool status = Unicode::isupper (c);
    return status;
  }

  // tell whether Unicode::isletter accepts a character (meta characters s/S)
  static inline bool re_check_letter (const t_quad c) {
    const bool status = Unicode::isletter (c);
    return status;
  }

  // tell whether Unicode::ishexa accepts a character (meta characters x/X)
  static inline bool re_check_hexa (const t_quad c) {
    const bool status = Unicode::ishexa (c);
    return status;
  }

  // check for a afnix constituent
  static inline bool re_check_afnix (const char c) {
    return Unicode::isafnix (c);
  }

  // check a character against a meta character
  // a lower case meta letter selects a character class predicate and the
  // matching upper case letter selects its negation; any other meta value
  // is compared literally with the character
  static bool re_check_meta (const t_quad meta, const t_quad c) {
    switch (meta) {
    case 'a': return  re_check_alpha  (c);
    case 'A': return !re_check_alpha  (c);
    case 'b': return  re_check_blank  (c);
    case 'B': return !re_check_blank  (c);
    case 'd': return  re_check_digit  (c);
    case 'D': return !re_check_digit  (c);
    case 'e': return  re_check_eos    (c);
    case 'E': return !re_check_eos    (c);
    case 'l': return  re_check_lower  (c);
    case 'L': return !re_check_lower  (c);
    case 'n': return  re_check_newln  (c);
    case 'N': return !re_check_newln  (c);
    case 's': return  re_check_letter (c);
    case 'S': return !re_check_letter (c);
    case 'u': return  re_check_upper  (c);
    case 'U': return !re_check_upper  (c);
    case 'x': return  re_check_hexa   (c);
    case 'X': return !re_check_hexa   (c);
    case 'w': return  re_check_afnix  (c);
    case 'W': return !re_check_afnix  (c);
    default:  break;
    }
    // not a known class letter: the meta character stands for itself
    return (meta == c);
  }

  // the regex character set structure
  struct s_recset {
    // unicode table
    Unitabler* p_utbl;
    // meta character
    t_quad d_meta;
    // inverted set flag
    bool   d_iflg;
    // next set in list
    s_recset* p_next;
    // simple constructor
    s_recset (void) {
      p_utbl = nilp;
      d_meta = nilq;
      d_iflg = false;
      p_next = nilp;
    }
    // simple destructor
    ~s_recset (void) {
      delete p_utbl;
      delete p_next;
    }
    // add a meta character to this set
    void meta (const t_quad c) {
      // check this node first
      if (d_meta == nilq) {
      d_meta = c;
      return;
      }
      // find the last character set
      s_recset* cset = this;
      while (cset->p_next != nilp) cset = cset->p_next;
      // create a new node and link
      s_recset* node = new s_recset;
      node->d_meta   = c;
      cset->p_next   = node;
    }
    // mark the character set with a character
    void mark (const t_quad c) {
      // eventually create a unicode table
      if (p_utbl == nilp) p_utbl = new Unitabler;
      p_utbl->add (c, nilp);
    }
    // complement the whole character set
    void iset (const bool iflg) {
      d_iflg = iflg;
      if (p_next != nilp) p_next->iset (iflg);
    }
    // return true if a character belongs
    // to the character set recursively
    bool isvalid (const t_quad c) {
      // check meta
      bool mflg = (d_meta != nilq) ? re_check_meta (d_meta, c) : false;
      if ((mflg == true) && (d_iflg == false)) return true;
      // check the unicode table
      bool uflg = (p_utbl != nilp) ? p_utbl->exists (c) : false;
      // compute status and eventually revert the result
      bool status = mflg || uflg;
      if (d_iflg == true) status = !status;
      if ((status == true) || (p_next == nilp)) return status;
      // compute next value
      return p_next->isvalid (c);
    }
  };

  // check a character against a character set
  // a nil character set never matches
  static bool re_check_cset (s_recset* cset, const t_quad c) {
    return (cset == nilp) ? false : cset->isvalid (c);
  }

  // the regex block node type
  // the type selects which member of the node data union is meaningful
  // and how the node is executed
  enum t_renode {
    RE_CHAR, // character node - matched literally against d_cval
    RE_META, // meta character node - d_cval is checked by re_check_meta
    RE_CSET, // character set node - owns the p_cset character set
    RE_BLOK, // sub blok node - p_nval is the first node of the sub chain
    RE_GMRK, // group mark - control node created for '('
    RE_GSET, // group accept - control node created for ')'
    RE_OPRD  // operand node - alternate node with p_nval and p_oprd branches
  };

  // the regex operator type
  // the operator drives the dispatch done by re_exec for a node
  enum t_reoper {
    RE_NONE, // no operator - the node is executed once
    RE_PLUS, // + operator
    RE_MULT, // * operator
    RE_ZONE, // ? operator
    RE_ALTN, // | operator
    RE_CTRL  // control node - group mark or group accept
  };

  // the regex node
  struct s_renode {
    // operator control
    t_reoper d_oper;
    // the node type
    t_renode d_type;
    // node data
    union {
      t_quad    d_cval;
      s_recset* p_cset;
      s_renode* p_nval;
    };
    // operand node
    s_renode* p_oprd;
    // next node in list
    s_renode* p_next;
    // protection flag
    bool d_prot;
    // create a new node by operator
    s_renode (t_renode type) {
      if (type != RE_CSET) {
      d_oper = RE_CTRL;
      d_type = type;
      d_cval = nilq;
      p_oprd = nilp;
      p_next = nilp;
      d_prot = false;
      } else {
      d_oper = RE_NONE;
      d_type = type;
      p_cset = new s_recset;
      p_oprd = nilp;
      p_next = nilp;
      d_prot = false;
      }
    }
    // create a new node by type and character
    s_renode (t_renode type, const t_quad c) {
      d_oper = RE_NONE;
      d_type = type;
      d_cval = c;
      p_oprd = nilp;
      p_next = nilp;
      d_prot = false;
    }
    // create a new block node
    s_renode (s_renode* node) {
      d_oper = RE_NONE;
      d_type = RE_BLOK;
      p_nval = node;
      p_oprd = nilp;
      p_next = nilp;
      d_prot = false;
    }
    // delete this node
    ~s_renode (void) {
      if (d_type == RE_CSET) {
      delete p_cset;
      p_cset = nilp;
      }
      if ((d_oper == RE_ALTN) && (p_next != nilp)) p_next->d_prot = true;
      if ((d_type == RE_BLOK) || (d_type == RE_OPRD)) {
      if ((p_nval != nilp) && (p_nval->d_prot == false)) delete p_nval;
      }
      if (d_type == RE_OPRD) {
      if ((p_oprd != nilp) && (p_oprd->d_prot == false)) delete p_oprd;
      }
      if ((d_oper == RE_ALTN) && (p_next != nilp)) p_next->d_prot = false;
      if ((p_next != nilp) && (p_next->d_prot == false)) delete p_next;
    }

    // mark a node as an operator - move node data if alternate
    void mark (t_reoper oper) {
      // check for alternate
      if (oper == RE_ALTN) {
      if (d_oper == RE_ALTN)
        throw Exception ("regex-error", "invalid dual | operator");
      // copy node
      s_renode* node = new s_renode (d_type);
      node->d_oper = d_oper;
      node->d_type = d_type;
      node->p_nval = p_nval;
      node->p_oprd = p_oprd;
      node->p_next = nilp;
      node->d_prot = d_prot;
      // re-adjust node
      d_oper = RE_ALTN;
      d_type = RE_OPRD;
      p_nval = node;
      p_oprd = nilp;
      p_next = nilp;
      return;
      }
      if (d_oper != RE_NONE) 
      throw Exception ("regex-error", "invalid operator position");
      d_oper = oper;
    }
  };

  // the regex structure
  struct s_regex {
    // the root node
    s_renode* p_root;
    // the last node
    s_renode* p_last;
    // the reference count
    long d_rcount;
    // initialize the regex structure
    s_regex (void) {
      p_root = nilp;
      p_last = nilp;
      d_rcount = 1;
    }
  };

  // read a character in the buffer - possibly escaped
  // a backslash followed by n or t gives eolq or tabq, a backslash followed
  // by any other character gives that character; an exception is raised
  // if the buffer ends right after the backslash
  static t_quad re_escape_char (InputString& buf) {
    // a regular character is returned as is
    const t_quad c = buf.rduc ();
    if (c != (t_quad) '\\') return c;
    // the character is escaped - read the escaped one
    const t_quad ec = buf.rduc ();
    if (ec == eofq) throw Exception ("regex-error", "invalid eof character");
    // map the special escapes, everything else stands for itself
    if (ec == (t_quad) 'n') return eolq;
    if (ec == (t_quad) 't') return tabq;
    return ec;
  }

  // find the last node in a node chain by following the next links
  // a nil chain gives a nil result
  static s_renode* re_find_last (s_renode* node) {
    s_renode* last = node;
    while ((last != nilp) && (last->p_next != nilp)) last = last->p_next;
    return last;
  }

  // append a node to a node chain
  // root and last are updated in place. When the last node is an alternate
  // node without its second branch, the node becomes that branch and last
  // is left untouched. When the alternate node is complete, the node is
  // linked at the end of both branches as well as after the alternate node
  static void re_append_node (s_renode** root, s_renode** last, 
                              s_renode* node) {
    // an empty chain starts with the node
    if (*root == nilp) {
      *root = node;
      *last = node;
      return;
    }
    s_renode* tail = *last;
    // check if we have an alternate node
    if (tail->d_oper == RE_ALTN) {
      // the second branch is still missing
      if (tail->p_oprd == nilp) {
        tail->p_oprd = node;
        return;
      }
      // continue both branches with the new node
      re_find_last (tail->p_nval)->p_next = node;
      re_find_last (tail->p_oprd)->p_next = node;
    }
    // link at the end of the chain
    tail->p_next = node;
    *last = node;
  }

  // regex context structure
  // the context is the matching state: the characters seen so far (d_str),
  // the current read position (d_cidx) and the group vector filled while
  // matching. In string mode the whole string is known at construction;
  // in input mode (d_imod) the characters are read on demand from the
  // input stream and appended to d_str.
  struct s_rectx {
    // input stream
    Input*  p_is;
    // string data 
    String  d_str;
    // input mode
    bool    d_imod;
    // string length
    long  d_len;
    // start index
    long  d_sidx;
    // current index;
    long  d_cidx;
    // group mark
    long  d_gmrk;
    // group vector
    Vector* p_grpv;
    // create a new string context at position 0
    s_rectx (const String& s, Vector* grpv) {
      p_is   = nilp;
      d_str  = s;
      d_imod = false;
      d_len  = s.length ();
      d_sidx = 0;
      d_cidx = d_sidx;
      d_gmrk = d_sidx;
      Object::iref (p_grpv = grpv);
    }
    // create a new string context at a given position
    // a start position past the end is clamped to the string length
    s_rectx (const String& s, const long sidx, Vector* grpv) {
      p_is   = nilp;
      d_str  = s;
      d_imod = false;
      d_len  = s.length ();
      d_sidx = (sidx < d_len) ? sidx : d_len;
      d_cidx = d_sidx;
      d_gmrk = d_sidx;
      Object::iref (p_grpv = grpv);
    }
    // create an input stream context at position 0
    s_rectx (Input* is, Vector* grpv) {
      p_is   = is;
      d_str  = "";
      d_imod = true;
      d_len  = 0;
      d_sidx = 0;
      d_cidx = d_sidx;
      d_gmrk = d_sidx;
      Object::iref (p_grpv = grpv);
    }
    // create an input stream context with a prefix at position 0
    // the prefix is matched before any character is read from the stream
    s_rectx (Input* is, const String& ps, Vector* grpv) {
      p_is   = is;
      d_str  = ps;
      d_imod = true;
      d_len  = ps.length ();
      d_sidx = 0;
      d_cidx = d_sidx;
      d_gmrk = d_sidx;
      Object::iref (p_grpv = grpv);
    }
    // copy construct this context
    s_rectx (const s_rectx& ctx) {
      p_is = ctx.p_is;
      d_str  = ctx.d_str;
      d_imod = ctx.d_imod;
      d_len  = ctx.d_len;
      d_sidx = ctx.d_sidx;
      d_cidx = ctx.d_cidx;
      d_gmrk = ctx.d_gmrk;
      Object::iref (p_grpv = ctx.p_grpv);
    }
    // destroy this context
    ~s_rectx (void) {
      Object::dref (p_grpv);
    }
    // assign a context to this one
    s_rectx& operator = (const s_rectx& ctx) {
      p_is   = ctx.p_is;
      d_str  = ctx.d_str;
      d_imod = ctx.d_imod;
      d_len  = ctx.d_len;
      d_sidx = ctx.d_sidx;
      d_cidx = ctx.d_cidx;
      d_gmrk = ctx.d_gmrk;
      // reference first so that a self assignment is harmless
      Object::iref (ctx.p_grpv);
      Object::dref (p_grpv);
      p_grpv = ctx.p_grpv;
      return *this;
    }
    // restore the old context with a new one
    // this context is the saved one and ctx is the context which moved
    // forward: in input mode the characters that ctx has read past this
    // context are pushed back in the input stream. The saved context is
    // returned so that it can be assigned back to the moving context
    s_rectx& restore (const s_rectx& ctx) {
      // check for good mode
      if ((d_imod == false) || (ctx.d_imod == false)) return *this;
      // make sure the we have more characters to push back
      if (ctx.d_len <= d_len) return *this;
      for (long i = d_len; i < ctx.d_len; i++) p_is->pushback (ctx.d_str[i]);
      return *this;
    }
    // return the context string
    String getstr (void) const {
      return d_str;
    }
    // read a character from this context
    // in input mode, a character is first read from the stream when the
    // current position is at the end of the context string. The end of
    // file character is returned when nothing is left
    t_quad rduc (void) {
      if ((d_imod == true) && (d_cidx == d_len)) {
        if ((p_is != nilp) && (p_is->iseof () == false)) {
          t_quad c = p_is->rduc ();
          d_str = d_str + c;
          d_len++;
        }
      }
      // use the t_quad end of file like the rest of the regex engine
      return (d_cidx == d_len) ? eofq : d_str[d_cidx++];
    }
    // return true if we are at the end
    bool iseof (void) const {
      if (d_imod == true) {
        return (p_is == nilp) ? true : p_is->iseof ();
      } else {
        return (d_cidx == d_len);
      }
    }
    // mark the group start
    void gmrk (void) {
      d_gmrk = d_cidx;
    }
    // set a group result
    // the characters between the group mark and the current position are
    // appended as a new string in the group vector
    void gset (void) {
      // the callers accept a nil group vector - nothing to record then
      if (p_grpv == nilp) return;
      long  len = d_cidx - d_gmrk;
      if (len <= 0) len = 0;
      t_quad* buf = new t_quad[len + 1];
      for (long i = 0; i < len; i++) buf[i] = d_str[i+d_gmrk];
      buf[len] = nilq;
      p_grpv->append (new String (buf));
      delete [] buf;
    }
    // pop a last entry in the group
    void gpop (void) {
      // nothing has been recorded without a group vector
      if (p_grpv == nilp) return;
      Object::cref (p_grpv->backt ());
    }
    // return the string match
    String subs (void) const {
      return d_str.substr (d_sidx, d_cidx);
    }
    // dump the context on the error terminal
    void dump (void) const {
      OutputTerm terr (OutputTerm::ERROR);
      terr << "ctx: str = " << d_str  << eolc;
      terr << "ctx: len = " << d_len  << eolc;
      terr << "ctx: cid = " << d_cidx << eolc;
    }
  };

  // forward declaration for node execution
  // the executors are mutually recursive: re_exec dispatches on the node
  // operator and the operator executors call re_exec back to continue
  // with the next node. Each executor returns true when the match succeeds
  static bool re_exec      (s_renode* node, s_rectx& ctx);
  static bool re_exec_root (s_renode* node, s_rectx& ctx);
  static bool re_exec_node (s_renode* node, s_rectx& ctx);
  static bool re_exec_loop (s_renode* node, s_rectx& ctx);
  static bool re_exec_plus (s_renode* node, s_rectx& ctx);
  static bool re_exec_mult (s_renode* node, s_rectx& ctx);
  static bool re_exec_zone (s_renode* node, s_rectx& ctx);
  static bool re_exec_altn (s_renode* node, s_rectx& ctx);
  static bool re_exec_ctrl (s_renode* node, s_rectx& ctx);

  // execute the chain starting at a particular node
  // a node with an operator is delegated to the operator executor which
  // also takes care of the rest of the chain. A plain node is executed
  // once and the execution continues with the next node; if the node
  // fails the context is restored
  static bool re_exec (s_renode* node, s_rectx& ctx) {
    // without node - we succeed
    if (node == nilp) return true;
    // dispatch the operator nodes
    switch (node->d_oper) {
    case RE_NONE: break;
    case RE_PLUS: return re_exec_plus (node, ctx);
    case RE_MULT: return re_exec_mult (node, ctx);
    case RE_ZONE: return re_exec_zone (node, ctx);
    case RE_ALTN: return re_exec_altn (node, ctx);
    case RE_CTRL: return re_exec_ctrl (node, ctx);
    default:      return false;
    }
    // plain node: save the context and execute the node once
    s_rectx bctx = ctx;
    if (re_exec_node (node, ctx) == false) {
      ctx = bctx.restore (ctx);
      return false;
    }
    // continue with the rest of the chain
    return re_exec (node->p_next, ctx);
  }

  // execute the root node
  // the root is executed like any node except when it is an alternate
  // node: the first branch is accepted only if the context is at the end
  // once it has matched, otherwise the second branch is tried from the
  // saved context
  static bool re_exec_root (s_renode* node, s_rectx& ctx) {
    // without node - we succeed
    if (node == nilp) return true;
    // anything but an alternate node is executed normally
    if (node->d_oper != RE_ALTN) return re_exec (node, ctx);
    // save the context and try the first branch up to the end
    s_rectx bctx = ctx;
    if ((re_exec (node->p_nval, ctx) == true) && (ctx.iseof () == true))
      return true;
    // rewind and try the second branch
    ctx = bctx.restore (ctx);
    const bool status = re_exec (node->p_oprd, ctx);
    // restore context if failure
    if (status == false) ctx = bctx.restore (ctx);
    return status;
  }

  // execute a particular node, without its operator and without the
  // rest of the chain
  // character, meta and character set nodes read one character from the
  // context; a block node executes its sub chain. Any other node type
  // raises an exception
  static bool re_exec_node (s_renode* node, s_rectx& ctx) {
    // check for node and succeed
    if (node == nilp) return true;
    // dispatch according to node type
    switch (node->d_type) {
    case RE_CHAR: return (ctx.rduc () == node->d_cval);
    case RE_META: return re_check_meta (node->d_cval, ctx.rduc ());
    case RE_CSET: return re_check_cset (node->p_cset, ctx.rduc ());
    case RE_BLOK: return re_exec (node->p_nval, ctx);
    default:      break;
    }
    throw Exception ("regex-error", "internal exec node error");
  }

  // execute a loop with fall back
  // the node is executed as many times as possible by recursion; at each
  // level, when the node or the deeper iterations fail, the context is
  // rewound to the state saved at that level and the rest of the chain
  // is tried from there
  static bool re_exec_loop (s_renode* node, s_rectx& ctx) {
    // save the context and execute the node
    s_rectx bctx = ctx;
    bool status  = ctx.iseof () ? false : re_exec_node (node, ctx);
    // iterate again only if the node has consumed something: a node which
    // succeeds without moving the current index (a block whose content
    // can match nothing) would succeed the same way forever and the
    // recursion would never end
    if ((status == true) && (ctx.d_cidx > bctx.d_cidx)) {
      if (re_exec_loop (node, ctx) == true) return true;
    }
    // fall back: rewind and continue with the rest of the chain
    ctx = bctx.restore (ctx);
    status = re_exec (node->p_next, ctx);
    if (status == false) ctx = bctx.restore (ctx);
    return status;
  }

  // execute a node with a plus operator
  // the node must match once, then the loop with fall back is executed
  static bool re_exec_plus (s_renode* node, s_rectx& ctx) {
    // save the context
    s_rectx bctx = ctx;
    // the mandatory occurrence, followed by the loop
    if (re_exec_node (node, ctx) == true) return re_exec_loop (node, ctx);
    // no occurrence at all
    ctx = bctx.restore (ctx);
    return false;
  }

  // execute a node with a mult operator
  // this is the loop with fall back, without any mandatory occurrence
  static bool re_exec_mult (s_renode* node, s_rectx& ctx) {
    const bool status = re_exec_loop (node, ctx);
    return status;
  }

  // execute a node with a zone operator (zero or one occurrence)
  // the chain is first tried with one occurrence of the node; if the node
  // or the rest of the chain fails, the chain is tried again without it
  static bool re_exec_zone (s_renode* node, s_rectx& ctx) {
    // save the context
    s_rectx zctx = ctx;
    // try with one occurrence
    if (re_exec_node (node, ctx) == true) {
      if (re_exec (node->p_next, ctx) == true) return true;
    }
    // try without the node - the context is rewound with restore, like in
    // the other executors, so that in input mode the characters read by
    // the failed attempt are pushed back in the stream instead of being
    // dropped by the plain assignment
    ctx = zctx.restore (ctx);
    return re_exec (node->p_next, ctx);
  }

  // execute an alternate node
  // the first branch (p_nval) is tried, then the second one (p_oprd) from
  // the saved context; the context is restored when both branches fail
  static bool re_exec_altn (s_renode* node, s_rectx& ctx) {
    // save the context
    s_rectx bctx = ctx;
    // try with first branch
    if (re_exec (node->p_nval, ctx) == true) return true;
    // try with second branch
    ctx = bctx.restore (ctx);
    if (re_exec (node->p_oprd, ctx) == true) return true;
    // nothing matches
    ctx = bctx.restore (ctx);
    return false;
  }

  // execute a control node
  // a group mark records the group start and a group accept records the
  // group string, then the rest of the chain is executed. A recorded
  // group is removed if the rest of the chain fails. Any other node type
  // raises an exception
  static bool re_exec_ctrl (s_renode* node, s_rectx& ctx) {
    // save the context
    s_rectx bctx = ctx;
    bool status  = false;
    if (node->d_type == RE_GMRK) {
      ctx.gmrk ();
      status = re_exec (node->p_next, ctx);
    } else if (node->d_type == RE_GSET) {
      ctx.gset ();
      status = re_exec (node->p_next, ctx);
      if (status == false) ctx.gpop ();
    } else {
      throw Exception ("regex-error", "internal regex control node error");
    }
    // restore context if failure
    if (status == false) ctx = bctx.restore (ctx);
    return status;
  }

  // get the group vector stored in the map, creating and storing a new
  // one when the map has none
  // the result is nil if the object stored in the map is not a vector
  static Vector* re_get_grpv (Thrmap& gmap) {
    Object* vobj = gmap.get ();
    if (vobj == nilp) {
      vobj = new Vector;
      gmap.set (vobj);
    }
    return dynamic_cast <Vector*> (vobj);
  }

  // fixup the root node by removing the block node
  // when the whole expression is a single block, the block node is
  // deleted and its sub chain becomes the root. The root is returned
  // unchanged in any other case
  static s_renode* re_remove_root_block (s_renode* root) {
    // check for blok node
    if (root == nilp) return nilp;
    if (root->d_type != RE_BLOK) return root;
    if (root->p_next != nilp)    return root;
    // a block with an operator (+, * or ?) must be kept: the operator is
    // stored in the block node and would be lost with it
    if (root->d_oper != RE_NONE) return root;
    // save result node and fixup root - the sub chain is detached first
    // so that it is not deleted with the block node
    s_renode* result = root->p_nval;
    root->p_nval = nilp;
    delete root;
    return result;
  }

  // compile a character set and return a character set node
  // the opening < has already been consumed; the characters are read up
  // to the closing > which is consumed as well. A ^ in first position
  // complements the set and a $ introduces a meta character. An exception
  // is raised if the stream ends before the closing >
  static s_renode* re_compile_cset (InputString& is) {
    s_renode* node = new s_renode (RE_CSET);
    try {
      // the complement flags are local to each set: with flags shared by
      // all the sets of an expression, only the first set could be
      // complemented and its complement would leak on the next ones
      bool first = true;
      bool iflg  = false;
      while (is.get () != '>') {
        t_quad c = is.rduc ();
        // check for the complement marker in first position
        if (first == true) {
          first = false;
          if (c == (t_quad) '^') {
            iflg = true;
            continue;
          }
        }
        if (c == eofq)
          throw Exception ("regex-error", "end of regex with <");
        if (c == (t_quad) '$') {
          c = is.rduc ();
          if (c == eofq)
            throw Exception ("regex-error", "end of regex with $");
          node->p_cset->meta (c);
        } else {
          node->p_cset->mark (c);
        }
      }
      // consume last character and eventually complement
      is.rduc ();
      node->p_cset->iset (iflg);
    } catch (...) {
      delete node;
      throw;
    }
    return node;
  }

  // compile from a stream and return a root node
  // bflag is true when compiling the content of a block, that is after an
  // opening [ : the compilation then returns at the matching ]. At top
  // level the whole stream is compiled and a root made of a single block
  // is replaced by the content of that block. A regex-error exception is
  // raised for unbalanced [] or (), an unterminated string or character
  // set, and a misplaced operator; the nodes built so far are deleted
  // before the exception leaves this function
  static s_renode* re_compile (InputString& is, const bool bflag) {
    // check for data
    if (is.iseof () == true) return nilp;

    // initialize the root and last node
    s_renode* root = nilp;
    s_renode* last = nilp;
    s_renode* node = nilp;

    // counters for balancing checks
    long bcount = bflag ? 1 : 0;
    long gcount = 0;

    try {
      // loop in the expressions
      while (is.iseof () == false) {
        t_quad c = is.rduc ();
        switch (c) {
        case eofq:
          if (bcount != 0) 
            throw Exception ("regex-error", "unbalanced [] in expression");
          if (gcount != 0) 
            throw Exception ("regex-error", "unbalanced () in expression");
          return root;
        case ']':
          // end of block - valid only when compiling a block content
          bcount--;
          if (bcount != 0) 
            throw Exception ("regex-error", "unbalanced [] in expression");
          if (gcount != 0) 
            throw Exception ("regex-error", "unbalanced () in expression");
          return root;      
        case '[':
          // compile the block content recursively
          node = new s_renode (re_compile (is, true));
          re_append_node (&root, &last, node);
          break;
        case '(':
          gcount++;
          node = new s_renode (RE_GMRK);
          re_append_node (&root, &last, node);
          break;
        case ')':
          gcount--;
          node = new s_renode (RE_GSET);
          re_append_node (&root, &last, node);
          break;
        case '$':
          c = is.rduc ();
          if (c == eofq) 
            throw Exception ("regex-error", "end of regex with $");
          node = new s_renode (RE_META, c);
          re_append_node (&root, &last, node);
          break;
        case '"':
          // each character of the string is a character node
          while ((is.iseof () == false) && (is.get () != '"')) {
            t_quad ec = re_escape_char (is);
            node = new s_renode (RE_CHAR, ec);
            re_append_node (&root, &last, node);
          }
          // consume last double quote
          c = is.rduc ();
          if (c != (t_quad) '"') 
            throw Exception ("regex-error", "unbalanced double quote");
          break;
        case '<':
          node = re_compile_cset (is);
          re_append_node (&root, &last, node);
          break;
        case '+':
          if (last == nilp) 
            throw Exception ("regex-error", "invalid + start");
          last->mark (RE_PLUS);
          break;
        case '*':
          if (last == nilp) 
            throw Exception ("regex-error", "invalid * start");
          last->mark (RE_MULT);
          break;
        case '?':
          if (last == nilp) 
            throw Exception ("regex-error", "invalid ? start");
          last->mark (RE_ZONE);
          break;
        case '|':
          if (last == nilp) 
            throw Exception ("regex-error", "invalid | start");
          last->mark (RE_ALTN);
          break;
        default:
          node = new s_renode (RE_CHAR, c);
          re_append_node (&root, &last, node);
          break;
        }
      }
      // check for balancing
      if (bcount != 0) 
        throw Exception ("regex-error", "unbalanced [] in expression");
      if (gcount != 0) 
        throw Exception ("regex-error", "unbalanced () in expression");
    } catch (...) {
      // the caller never sees the partial tree - delete it here
      delete root;
      throw;
    }
    return re_remove_root_block (root);
  }

  // -------------------------------------------------------------------------
  // - class section                                                         -
  // -------------------------------------------------------------------------

  // create a null regex

00927   Regex::Regex (void) {
    p_recni = new s_regex;
  }

  // create a regex from a string

00933   Regex::Regex (const String& re) {
    p_recni = new s_regex;
    compile (re);
  }

  // copy construct this regex
  
00940   Regex::Regex (const Regex& that) {
    that.rdlock ();
    d_reval = that.d_reval;
    p_recni = that.p_recni;
    p_recni->d_rcount++;
    that.unlock ();
  }

  // destroy this regex

00950   Regex::~Regex (void) {
    if (--p_recni->d_rcount == 0) {
      delete p_recni->p_root;
      delete p_recni;
    }
  }

  // assign a regex to this one

00959   Regex& Regex::operator = (const Regex& that) {
    // make sure the regex are not equal
    if (this == &that) return *this;
    // lock this and that
    that.rdlock ();
    wrlock ();
    // copy regex value
    d_reval = that.d_reval;
    // reference the regex structure
    if (--p_recni->d_rcount == 0) {
      delete p_recni->p_root;
      delete p_recni;      
    }
    p_recni = that.p_recni;
    p_recni->d_rcount++;
    // unlock everything
    unlock ();
    that.unlock ();
    return *this;
  }

  // assign a regex description to this regex

00982   Regex& Regex::operator = (const String& re) {
    compile (re);
    return *this;
  }

  // return the class name

00989   String Regex::repr (void) const {
    return "Regex";
  }

  // return a string representation of this regex

00995   String Regex::tostring (void) const {
    return d_reval;
  }

  // return a literal representation of this regex

01001   String Regex::toliteral (void) const {
    rdlock ();
    String result = (d_reval[0] == '[') ? d_reval : (String ("[") + d_reval 
                                                           + "]");
    unlock ();
    return result;
  }

  // clone this regex

01011   Object* Regex::clone (void) const {
    return new Regex (*this);
  }

  // return the regex serial code

01017   t_byte Regex::serialid (void) const {
    return SERIAL_REGX_ID;
  }

  // serialize this regex

01023   void Regex::wrstream (Output& os) const {
    rdlock ();
    d_reval.wrstream (os);
    unlock ();
  }

  // deserialize this regex

01031   void Regex::rdstream (Input& is) {
    wrlock ();
    try {
      String sval;
      sval.rdstream (is);
      compile (sval);
      unlock ();
    } catch (...) {
      unlock ();
      throw;
    }
  }

  // compile a string as a regex

01046   void Regex::compile (const String& re) {
    wrlock ();
    try {
      // clean the old compiled node info
      if (--p_recni->d_rcount == 0) {
      delete p_recni->p_root;
      delete p_recni;
      p_recni = new s_regex;
      }
      // create an input stream
      InputString is (re);
      // get the root and last node
      p_recni->p_root = re_compile   (is, false);
      p_recni->p_last = re_find_last (p_recni->p_root);
      // check for consistency
      if (is.iseof () == false) {
      delete p_recni->p_root;
      p_recni->p_root = nilp;
      p_recni->p_last = nilp;
      throw Exception ("regex-error", "regex syntax error", re);
      }
      // save the string regex
      d_reval = re;
      // unlock the regex
      unlock ();
    } catch (...) {
      delete p_recni->p_root;
      p_recni->p_root = nilp;
      p_recni->p_last = nilp;
      unlock ();
      throw;
    }
  }

  // match this regex against a string

01082   bool Regex::operator == (const String& s) const {
    // get the group vector and reset it
    Vector* grpv = re_get_grpv (d_gmap);
    if (grpv != nilp) grpv->reset ();
    // lock in read mode
    rdlock ();
    // create a regex context
    s_rectx ctx (s, 0, grpv);
    try {
      bool result = re_exec_root (p_recni->p_root, ctx) & ctx.iseof ();
      unlock ();
      return result;
    } catch (...) {
      if (grpv != nilp) grpv->reset ();
      unlock ();
      throw;
    }
  }

  // match this regex partially against a string

01103   bool Regex::operator < (const String& s) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    // lock in read mode
    rdlock ();
    long len = s.length ();
    for (long i = 0; i < len; i++) {
      // reset the group vector
      if (grpv != nilp) grpv->reset ();
      // create a regex context
      s_rectx ctx (s, i, grpv);
      try {
      bool result = re_exec (p_recni->p_root, ctx);
      if (result == false) continue;
      unlock ();
      return result;
      } catch (...) {
      if (grpv != nilp) grpv->reset ();
      unlock ();
      throw;
      }
    }
    unlock ();
    return false;
  }

  // reverse matching the regex

01131   bool Regex::operator != (const String& s) const {
    return (*this == s) ? false : true;
  }

  // match this regex with an input stream

01137   String Regex::match (Input* is) const {
    if (is == nilp) return "";
    return match (is, "");
  }

  // match this regex with an input stream and an initial string

01144   String Regex::match (Input* is, const String& ps) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    rdlock ();
    // protect the input stream
    Object::iref (is);
    try {
      // reset the group vector
      if (grpv != nilp) grpv->reset ();
      // create a regex context
      s_rectx ctx (is, ps, grpv);
      // execute the regex
      bool status = re_exec (p_recni->p_root, ctx);
      String result = status ? ctx.getstr () : "";
      // unportect the stream
      Object::tref (is);
      // unlock and return
      unlock ();
      return result;
    } catch (...) {
      // reset the group vector
      if (grpv != nilp) grpv->reset ();
      // unportect the stream
      Object::tref (is);
      // unlock and throw
      unlock ();
      throw;
    }
  }

  // match this regex partially and return the matching string

01176   String Regex::match (const String& s) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    // lock in read mode
    rdlock ();
    long len = s.length ();
    for (long i = 0; i < len; i++) {
      // reset the group vector
      if (grpv != nilp) grpv->reset ();
      // create a regex context
      s_rectx ctx (s, i, grpv);
      try {
      if (re_exec (p_recni->p_root, ctx) == false) continue;
      String result = ctx.subs ();
      unlock ();
      return result;
      } catch (...) {
      if (grpv != nilp) grpv->reset ();
      unlock ();
      throw;
      }
    }
    unlock ();
    return "";
  }

  // replace a match with another string

01204   String Regex::replace (const String& s, const String& val) const {
    Buffer result;
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    // lock in read mode
    rdlock ();
    long len = s.length ();
    for (long i = 0; i < len; i++) {
      // reset the group vector
      if (grpv != nilp) grpv->reset ();
      // create a regex context
      s_rectx ctx (s, i, grpv);
      try {
      if (re_exec_root (p_recni->p_root, ctx) == false) {
        result.add (s[i]);
        continue;
      }
      result.add (val);
      i = ctx.d_cidx - 1;
      } catch (...) {
      if (grpv != nilp) grpv->reset ();
      unlock ();
      throw;
      }
    }
    unlock ();
    return result.tostring ();
  }

  // get the length of the group vector

01235   long Regex::length (void) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    // get its length
    return (grpv == nilp) ? 0 : grpv->length ();
  }

  // get an object at certain index

01244   Object* Regex::getobj (const long index) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    return  (grpv == nilp) ? nilp : grpv->get (index);
  }

  // get a string at certain index

01252   String Regex::getstr (const long index) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    if (grpv == nilp) 
      throw Exception ("regex-error", "out of bound group access");
    return grpv->getstring (index);
  }

  // get an integer at certain index

01262   t_long Regex::getint (const long index) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    if (grpv == nilp) 
      throw Exception ("regex-error", "out of bound group access");
    Integer val (grpv->getstring (index));
    return val.tointeger ();
  }

  // get a real at certain index

01273   t_real Regex::getreal (const long index) const {
    // get the group vector
    Vector* grpv = re_get_grpv (d_gmap);
    if (grpv == nilp) 
      throw Exception ("regex-error", "out of bound group access");
    Real val (grpv->getstring (index));
    return val.toreal ();
  }

  // -------------------------------------------------------------------------
  // - object section                                                        -
  // -------------------------------------------------------------------------

  // the quark zone: holds the quarks this class answers to; its length
  // equals the number of quarks interned below
  static const long QUARK_ZONE_LENGTH = 4;
  static QuarkZone  zone (QUARK_ZONE_LENGTH);

  // the object supported quarks, interned in the zone in declaration
  // order: get, match, length, replace
  static const long QUARK_GET      = zone.intern ("get");
  static const long QUARK_MATCH    = zone.intern ("match");
  static const long QUARK_LENGTH   = zone.intern ("length");
  static const long QUARK_REPLACE  = zone.intern ("replace");

  // create a new object in a generic way

01298   Object* Regex::mknew (Vector* argv) {
    long argc = (argv == nilp) ? 0 : argv->length ();
    // check for 0 argument
    if (argc == 0) return new Regex;
    // check for 1 argument
    if (argc == 1) {
      String str = argv->getstring (0);
      return new Regex (str);
    }
    throw Exception ("argument-error", "too many argument with regex");
  }

  // return true if the given quark is defined

01312   bool Regex::isquark (const long quark, const bool hflg) const {
    rdlock ();
    if (zone.exists (quark) == true) {
      unlock ();
      return true;
    }
    bool result = hflg ? Literal::isquark (quark, hflg) : false;
    unlock ();
    return result;
  }

  // operate this object with another object

01325   Object* Regex::oper (t_oper type, Object* object) {
    // get a literal object
    Literal* lobj = dynamic_cast <Literal*> (object);
    switch (type) {
    case Object::EQL:
      if (lobj != nilp) return new Boolean (*this == lobj->tostring ());
      break;
    case Object::NEQ:
      if (lobj != nilp) return new Boolean (*this != lobj->tostring ());
      break;
    case Object::LTH:
      if (lobj != nilp) return new Boolean (*this < lobj->tostring ());
      break;      
    default:
      break;
    }
    throw Exception ("type-error", "invalid operand with regex",
                 Object::repr (object));
  }

  // set an object to this regex

01347   Object* Regex::vdef (Runnable* robj, Nameset* nset, Object* object) {
    String* sobj = dynamic_cast <String*> (object);
    if (sobj != nilp) {
      compile (*sobj);
      return this;
    }
    Regex* gobj = dynamic_cast <Regex*> (object);
    if (gobj != nilp) {
      *this = *gobj;
      return this;
    }
    throw Exception ("type-error", "invalid object with regex vdef",
                 Object::repr (object));
  }

  // apply this object with a set of arguments and a quark

01364   Object* Regex::apply (Runnable* robj, Nameset* nset, const long quark,
                  Vector* argv) {
    // get the number of arguments
    long argc = (argv == nilp) ? 0 : argv->length ();

    // dispatch 0 argument
    if (argc == 0) {
      if (quark == QUARK_LENGTH) return new Integer (length ());
    } 

    // dispatch 1 argument
    if (argc == 1) {
      if (quark == QUARK_GET) {
      Object* result = getobj (argv->getint (0));
      robj->post (result);
      return result;
      }
      if (quark == QUARK_MATCH) {
      Object*  obj = argv->get (0);
      // check for a stream
      Input*  iobj = dynamic_cast <Input*> (obj);
      if (iobj != nilp) return new String (match (iobj));
      // check for a string
      String* sobj = dynamic_cast <String*> (obj);
      if (sobj != nilp) return new String (match (*sobj));
      // argument error
      throw Exception ("type-error", "invalid object with match ", 
                   Object::repr (obj));
      }
    }

    // dispatch 2 arguments
    if (argc == 2) {
      if (quark == QUARK_REPLACE) {
      String s   = argv->getstring (0);
      String val = argv->getstring (1);
      String* result = new String (replace (s, val));
      robj->post (result);
      return result;
      }
      if (quark == QUARK_MATCH) {
      // get the input stream
      Object*  obj = argv->get (0);
      Input*  iobj = dynamic_cast <Input*> (obj);
      // get the prefix
      String ps = argv->getstring (1);
      if (iobj != nilp) return new String (match (iobj, ps));
      throw Exception ("type-error", "invalid object with match ", 
                   Object::repr (obj));
      }
    }    
    // call the literal method
    return Literal::apply (robj, nset, quark, argv);
  }
}

// Generated by Doxygen 1.6.0 (source listing page footer)