Logo Search packages:      
Sourcecode: afnix version File versions

Unicode.cpp

// ---------------------------------------------------------------------------
// - Unicode.cpp                                                             -
// - standard object library - unicode functions class implementation        -
// ---------------------------------------------------------------------------
// - This program is free software;  you can redistribute it  and/or  modify -
// - it provided that this copyright notice is kept intact.                  -
// -                                                                         -
// - This program  is  distributed in  the hope  that it will be useful, but -
// - without  any  warranty;  without  even   the   implied    warranty   of -
// - merchantability or fitness for a particular purpose.  In no event shall -
// - the copyright holder be liable for any  direct, indirect, incidental or -
// - special damages arising in any way out of the use of this software.     -
// ---------------------------------------------------------------------------
// - copyright (c) 1999-2007 amaury darsch                                   -
// ---------------------------------------------------------------------------

#include "Ascii.hpp"
#include "Unicode.hpp"
#include "Utility.hpp"
#include "Exception.hpp"
#include "cucd.hpp"

namespace afnix {

  // convert a unicode character to a native character if possible

00027   char Unicode::tochar (const t_quad value) {
    // check for 8 bit range
    if ((value & 0xFFFFFF00) != 0x00000000)
      throw Exception ("unicode-error", "cannot convert unicode character");
    // map the character
    char result = (char) (value & 0x000000FF);
    return result;
  }

  // convert a hexadecimal character to a byte

00038   t_byte Unicode::htob (const t_quad value) {
    char c = Unicode::tochar (value);
    return Ascii::htob (c);
  }

  // convert a native character to a unicode character

00045   t_quad Unicode::toquad (const char value) {
    t_quad result = value;
    return result & 0x000000ff;
  }

  // convert a string representation to a character

00052   t_quad Unicode::toquad (const String& value) {
    long slen = value.length ();
    // check for single character
    if (slen == 1) {
      t_quad result = value[0];
      return result;
    }
    // check for ascii representation
    if ((slen > 2) && (value[0] == '\'')) {
      t_quad result = Unicode::toquad (Ascii::tochar (value));
      return result;
    }
    // check for unicode representation
    if ((slen > 2) && (value[0] == 'U') && (value[1] == '+')) {
      // format the string
      String format = "0x";
      format += value.rsubstr (2);
      // convert to quad
      return (t_quad) Utility::tointeger (format);
    }
    // invalid format
    throw Exception ("format-error",
                 "illegal unicode string representation", value);
  }

  // convert a unicode character value to a string

00079   String Unicode::tostring (const t_quad value) {
    // check for an ascii character
    if ((value & 0xFFFFFF00) == 0x00000000) {
      char cval = (char) (value & 0x000000FF);
      String result = Ascii::tostring (cval);
      return result;
    }
    // we are outside the ascii range, so use the unicode representation
    String result = "U+";
    result += Utility::tohexa ((long) value);
    return result;
  }

  // convert a native character value to a literal string

00094   String Unicode::toliteral (const t_quad value) {
    String result;
    if (Unicode::isascii (value) == true) {
      char cval = (char) (value & 0x000000FF);
      result += '\'';
      result += cval;
      result += '\'';
    } else {
      result += '"';
      result += Unicode::tostring (value);
      result += '"';
    }
    return result;
  }

  // get the size of unicode array

00111   long Unicode::strlen (const t_quad* s) {
    // check for nil string
    if (s == nilp) return 0;
    // compute length
    long result = 0;
    while (*s++ != nilq) result++;
    return result;
  }

  // compare two strings and returns true if they are equals.

00122   bool Unicode::strcmp (const t_quad* s1, const char* s2) {
    // nil case first
    if ((s1 == nilp) &&  (s2 == nilp)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 == nilc)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 != nilc)) return false;
    if ((s1 != nilp) && (*s1 == nilq) &&  (s2 == nilp)) return true;
    if ((s1 != nilp) && (*s1 != nilq) &&  (s2 == nilp)) return false;
    // check first character for fast compare
    if (*s1 != Unicode::toquad (*s2)) return false;
    // normal compare now
    while (*s1 != nilq) {
      if (*s2 == nilc) break;
      if (*s1++ != Unicode::toquad (*s2++)) return false;
    }
    return (*s1 == Unicode::toquad (*s2)) ? true : false;
  }

  // compare two strings and returns true if they are equals.

00141   bool Unicode::strcmp (const t_quad* s1, const t_quad* s2) {
    // nil case first
    if ((s1 == nilp) &&  (s2 == nilp)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 == nilq)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 != nilq)) return false;
    if ((s1 != nilp) && (*s1 == nilq) &&  (s2 == nilp)) return true;
    if ((s1 != nilp) && (*s1 != nilq) &&  (s2 == nilp)) return false;
    // check first character for fast compare
    if (*s1 != *s2) return false;
    // normal compare now
    while (*s1 != nilq) {
      if (*s2 == nilq) break;
      if (*s1++ != *s2++) return false;
    }
    return (*s1 == *s2) ? true : false;
  }

  // compare two strings upto n characters

00160   bool Unicode::strncmp (const t_quad* s1, const char* s2, const long size) {
    // nil case compare
    if (size == 0) return true;
    long len1 = Unicode::strlen (s1);
    long len2 = Ascii::strlen   (s2);
    if ((len1 == 0) || (len2 == 0)) return false;
    // normal compare
    for (long i = 0; i < size; i++) {
      if ((s1[i] == nilq) && (s2[i] == nilc)) return false;
      if (s1[i] != Unicode::toquad (s2[i])) return false;
    }
    return true;
  }

  // compare two strings upto n characters

00176   bool Unicode::strncmp (const t_quad* s1, const t_quad* s2, const long size) {
    // nil case compare
    if (size == 0) return true;
    long len1 = Unicode::strlen (s1);
    long len2 = Unicode::strlen (s2);
    if ((len1 == 0) || (len2 == 0)) return false;
    // normal compare
    for (long i = 0; i < size; i++) {
      if ((s1[i] == nilq) && (s2[i] == nilq)) return false;
      if (s1[i] != s2[i]) return false;
    }
    return true;
  }

  // compare two strings - less than operator

00192   bool Unicode::strlth (const t_quad* s1, const char* s2) {
    // nil case first
    if ((s1 == nilp) &&  (s2 == nilp)) return false;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 == nilc)) return false;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 != nilc)) return true;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 == nilq)) return false;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 != nilq)) return false;
    // compare without equal
    while (*s1 != nilq) {
      if (*s1   < Unicode::toquad (*s2))   return true;
      if (*s1++ > Unicode::toquad (*s2++)) return false;
    }
    return false;
  }


  // compare two strings - less than operator

00210   bool Unicode::strlth (const t_quad* s1, const t_quad* s2) {
    // nil case first
    if ((s1 == nilp) &&  (s2 == nilp)) return false;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 == nilq)) return false;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 != nilq)) return true;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 == nilq)) return false;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 != nilq)) return false;
    // compare without equal
    while (*s1 != nilq) {
      if (*s1   < *s2)   return true;
      if (*s1++ > *s2++) return false;
    }
    return false;
  }

  // compare two strings - less equal operator

00227   bool Unicode::strleq (const t_quad* s1, const char* s2) {
    // nil case first
    if ((s1 == nilp) &&  (s2 == nilp)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 == nilc)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 != nilc)) return true;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 == nilq)) return true;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 != nilq)) return false;
    // compare with equal
    while (*s1 != nilq) {
      if (*s1   < Unicode::toquad (*s2))   return true;
      if (*s1++ > Unicode::toquad (*s2++)) return false;
    }
    return true;
  }

  // compare two strings - less equal operator

00244   bool Unicode::strleq (const t_quad* s1, const t_quad* s2) {
    // nil case first
    if ((s1 == nilp) &&  (s2 == nilp)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 == nilq)) return true;
    if ((s1 == nilp) &&  (s2 != nilp) && (*s2 != nilq)) return true;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 == nilq)) return true;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 != nilq)) return false;
    if ((s1 != nilp) &&  (s2 == nilp) && (*s1 != nilq)) return false;
    // compare with equal
    while (*s1 != nilq) {
      if (*s1   < *s2)   return true;
      if (*s1++ > *s2++) return false;
    }
    return true;
  }

  // normalize a string into a new one

  //t_quad* Unicode::strnrm (const t_quad* s) {
  //  // allocate sufficient space
  //  long slen = strlen (s);
  //  long blen = slen * UCD_CDV_MAX + 1;
  //  t_quad* buf = new t_quad[blen];
  //  // normalize the string
  //  long bi = 0;
  //  for (long si = 0; si < slen; si++) {
  //    // get the character
  //    t_quad code = s[si];
  // }
  // }

  // convert an ascii character to an unicode array

00277   t_quad* Unicode::strmak (const char value) {
    t_quad* result = new t_quad[2];
    result[0] = Unicode::toquad (value);
    result[1] = nilq;
    return result;
  }

  // convert a unicode character to an unicode array

00286   t_quad* Unicode::strmak (const t_quad value) {
    t_quad* result = new t_quad[2];
    result[0] = value;
    result[1] = nilq;
    return result;
  }

  // create a unicode string from a string and a character

00295   t_quad* Unicode::strmak (const t_quad* s, const char c) {
    return Unicode::strmak (s, Unicode::toquad (c));
  }

  // create a unicode string from a string one and a unicode character

00301   t_quad* Unicode::strmak (const t_quad* s, const t_quad c) {
    // compute size
    long slen = Unicode::strlen (s);
    t_quad* result = new t_quad[slen+2];
    // copy string directly
    for (long i = 0; i < slen; i++) result[i] = s[i];
    // add character and nilq
    result[slen] = c;
    result[slen+1] = nilq;
    return result;
  }

  // create a unicode string from a character and a string

00315   t_quad* Unicode::strmak (const char c, const t_quad* s) {
    return Unicode::strmak (Unicode::toquad (c), s);
  }

  // create a unicode string from a unicode character and a string

00321   t_quad* Unicode::strmak (const t_quad c, const t_quad* s) {
    // compute size
    long slen = Unicode::strlen (s);
    t_quad* result = new t_quad[slen+2];
    // add character first
    result[0] = c;
    // copy string directly
    for (long i = 0; i < slen; i++) result[i+1] = s[i];
    // mark end of string
    result[slen+1] = nilq;
    return result;
  }

  // convert an ascii string to an unicode array

00336   t_quad* Unicode::strdup (const char* s) {
    // check for null string if the length is null
    long len = Ascii::strlen (s);
    // allocate a new string with new so delete can be used
    t_quad* result = new t_quad[len + 1];
    if (len > 0) {
      for (long i = 0; i < len + 1; i++) result[i] = Unicode::toquad (s[i]);
    }
    result[len] = nilq;
    return result;
  }

  // convert a unicode string to an unicode array

00350   t_quad* Unicode::strdup (const t_quad* s) {
    // check for null string if the length is null
    long len = Unicode::strlen (s);
    // allocate a new string with new so delete can be used
    t_quad* result = new t_quad[len + 1];
    if (len > 0) {
      for (long i = 0; i < len + 1; i++) result[i] = s[i];
    }
    result[len] = nilq;
    return result;
  }

  // convert a unicode string to an unicode array by size

00364   t_quad* Unicode::strdup (const t_quad* s, const long size) {
    // allocate a new string with new so delete can be used
    t_quad* result = new t_quad[size + 1];
    if (size > 0) {
      for (long i = 0; i < size + 1; i++) result[i] = s[i];
    }
    result[size] = nilq;
    return result;
  }

  // copy a c-string from a source to a unicode destination

00376   void Unicode::strcpy (t_quad* dst, const char* src) {
    // standard check as usual
    if (dst == nilp) return;
    // get the length and copy
    long len = Ascii::strlen (src);
    if (len > 0) {
      for (long i = 0; i < len + 1; i++) {
      dst[i] = Unicode::toquad (src[i]);
      }
    }
    dst[len] = nilq;
  }

  // copy a unicode string from a source to a unicode destination

00391   void Unicode::strcpy (t_quad* dst, const t_quad* src) {
    // standard check as usual
    if (dst == nilp) return;
    // get the length and copy
    long len = Unicode::strlen (src);
    if (len > 0) {
      for (long i = 0; i < len + 1; i++) dst[i] = src[i];
    }
    dst[len] = nilq;
  }

  // concatenate a c-string with a unicode string

00404   void Unicode::strcat (t_quad* dst, const char* src) {
    // standard check as usual
    if (dst == nilp) return;
    // get length and position
    long len = Ascii::strlen   (src);
    long pos = Unicode::strlen (dst);
    if (len > 0) {
      for (long i = 0; i < len + 1; i++) {
      dst[pos + i] = Unicode::toquad (src[i]);
      }
    }
    dst[pos+len] = nilq;
  }

  // concatenate a unicode string with a unicode string

00420   void Unicode::strcat (t_quad* dst, const t_quad* src) {
    // standard check as usual
    if (dst == nilp) return;
    // get length and position
    long len = Unicode::strlen (src);
    long pos = Unicode::strlen (dst);
    if (len > 0) {
      for (long i = 0; i < len + 1; i++) dst[pos + i] = src[i];
    }
    dst[pos+len] = nilq;
  }

  // remove the leading blank and tab and return a new string

00434   t_quad* Unicode::rmlead (const char* s) {
    // get the length and remove
    long len = Ascii::strlen (s);
    if (len != 0) {
      // remove leading blank
      while ((*s != nilc) && ((*s == blkc) || (*s == tabc))) s++;
    }
    // now copy
    return Unicode::strdup (s);
  }
 
  // remove the leading blank and tab and return a new string

00447   t_quad* Unicode::rmlead (const t_quad* s) {
    // get the length and remove
    long len = Unicode::strlen (s);
    if (len != 0) {
      // remove leading blank
      while ((*s != nilq) && ((*s == blkq) || (*s == tabq))) s++;
    }
    // now copy
    return Unicode::strdup (s);
  }

  // remove the trailing blank from a string and return a new string

00460   t_quad* Unicode::rmtrail (const char* s) {
    // get the length and check
    long len = Ascii::strlen (s);
    if (len == 0) return Unicode::strdup (s);
    char* sbuf = Ascii::strdup (s);
    char* end  = sbuf + len - 1;
    // remove trailing blank
    while ((end != s) && ((*end == blkc) || (*end == tabc))) *end-- = nilc;
    // now copy and return
    t_quad* result = Unicode::strdup (sbuf);
    delete [] sbuf;
    return result;
  }

  // remove the trailing blank from a string and return a new string

00476   t_quad* Unicode::rmtrail (const t_quad* s) {
    // get the length and check
    long len = Unicode::strlen (s);
    if (len == 0) return Unicode::strdup (s);
    t_quad* sbuf = Unicode::strdup (s);
    t_quad* end  = sbuf + len - 1;
    // remove trailing blank
    while ((end != s) && ((*end == blkq) || (*end == tabq))) *end-- = nilq;
    // now copy and return
    t_quad* result = Unicode::strdup (sbuf);
    delete [] sbuf;
    return result;
  }

  // convert a character from upper case to lower case
  // dst receives the mapped characters and must have room for at least
  // UCD_LCM_MAX elements; the number of characters written is returned
  // and is always at least 1: when there is no ucd record or the lower
  // map is empty, the source character is stored unchanged
  static long unicode_tolower (t_quad* dst, const t_quad src) {
    long result = 0;
    // copy the lower map until nil, when a ucd record exists
    const ucd_s* ucd = c_getucd (src);
    if (ucd != nilp) {
      for (long i = 0; i < UCD_LCM_MAX; i++) {
        const t_quad c = ucd->d_lmap[i];
        if (c == nilq) break;
        dst[result++] = c;
      }
    }
    // nothing mapped: keep the existing character and count it, so that
    // the caller does not drop it from its output
    if (result == 0) dst[result++] = src;
    return result;
  }

  // convert an ascii string to lower case

00513   t_quad* Unicode::tolower (const char* s) {
    // check for length
    if (s == nilp) return nilp;
    long len = Ascii::strlen (s);
    // allocate and convert
    long    size = len * UCD_LCM_MAX + 1;
    t_quad* sbuf = new t_quad[size];
    long    sidx = 0;
    t_quad  sdst[UCD_LCM_MAX];
    for (long i = 0; i < len; i++) {
      long cnvs = unicode_tolower (sdst, Unicode::toquad (s[i]));
      for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
    }
    // mark end of string
    sbuf[sidx] = nilq;
    // copy and clean
    t_quad* result = Unicode::strdup (sbuf);
    delete [] sbuf;
    return result;
  }

  // convert an ascii string to lower case

00536   t_quad* Unicode::tolower (const t_quad* s) {
    // check for length
    if (s == nilp) return nilp;
    long len = Unicode::strlen (s);
    // allocate and convert
    long    size = len * UCD_LCM_MAX + 1;
    t_quad* sbuf = new t_quad[size];
    long    sidx = 0;
    t_quad  sdst[UCD_LCM_MAX];
    for (long i = 0; i < len; i++) {
      long cnvs = unicode_tolower (sdst, s[i]);
      for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
    }
    // mark end of string
    sbuf[sidx] = nilq;
    // copy and clean
    t_quad* result = Unicode::strdup (sbuf);
    delete [] sbuf;
    return result;
  }

  // convert a character from lower case to upper case
  // dst receives the mapped characters and must have room for at least
  // UCD_UCM_MAX elements; the number of characters written is returned
  // and is always at least 1: when there is no ucd record or the upper
  // map is empty, the source character is stored unchanged

  static long unicode_toupper (t_quad* dst, const t_quad src) {
    long result = 0;
    // copy the upper map until nil, when a ucd record exists
    const ucd_s* ucd = c_getucd (src);
    if (ucd != nilp) {
      for (long i = 0; i < UCD_UCM_MAX; i++) {
        const t_quad c = ucd->d_umap[i];
        if (c == nilq) break;
        dst[result++] = c;
      }
    }
    // nothing mapped: keep the existing character and count it, so that
    // the caller does not drop it from its output
    if (result == 0) dst[result++] = src;
    return result;
  }

  // convert an ascii string to upper case

00581   t_quad* Unicode::toupper (const char* s) {
    // check for length
    if (s == nilp) return nilp;
    long len = Ascii::strlen (s);
    // allocate and convert
    long    size = len * UCD_UCM_MAX + 1;
    t_quad* sbuf = new t_quad[size];
    long    sidx = 0;
    t_quad  sdst[UCD_UCM_MAX];
    for (long i = 0; i < len; i++) {
      long cnvs = unicode_toupper (sdst, Unicode::toquad (s[i]));
      for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
    }
    // mark end of string
    sbuf[sidx] = nilq;
    // copy and clean
    t_quad* result = Unicode::strdup (sbuf);
    delete [] sbuf;
    return result;
  }

  // convert an unicode string to upper case

00604   t_quad* Unicode::toupper (const t_quad* s) {
    // check for length
    if (s == nilp) return nilp;
    long len = Unicode::strlen (s);
    // allocate and convert
    long    size = len * UCD_UCM_MAX + 1;
    t_quad* sbuf = new t_quad[size];
    long    sidx = 0;
    t_quad  sdst[UCD_UCM_MAX];
    for (long i = 0; i < len; i++) {
      long cnvs = unicode_toupper (sdst, s[i]);
      for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
    }
    // mark end of string
    sbuf[sidx] = nilq;
    // copy and clean
    t_quad* result = Unicode::strdup (sbuf);
    delete [] sbuf;
    return result;
  }

  // return true if the character is a lower character

00627   bool Unicode::islower (const t_quad value) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (value);
    if (ucd == nilp) return false;
    // check for lower case code
    return (ucd->d_pgcv == UCD_GCV_LL);
  }

  // return true if the character is an upper character

00637   bool Unicode::isupper (const t_quad value) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (value);
    if (ucd == nilp) return false;
    // check for lower case code
    return (ucd->d_pgcv == UCD_GCV_LU);
  }

  // return true if the unicode character is a letter

00647   bool Unicode::isletter (const t_quad value) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (value);
    if (ucd == nilp) return false;
    // get the gcv byte and check
    t_byte gcv = ucd->d_pgcv;
    if (gcv == UCD_GCV_LU) return true;
    if (gcv == UCD_GCV_LL) return true;
    if (gcv == UCD_GCV_LT) return true;
    if (gcv == UCD_GCV_LM) return true;
    if (gcv == UCD_GCV_LO) return true;
    return false;
  }

  // return true if the unicode character is a digit

00663   bool Unicode::isdigit (const t_quad value) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (value);
    if (ucd == nilp) return false;
    // get the gcv byte and check
    t_byte gcv = ucd->d_pgcv;
    if (gcv == UCD_GCV_ND) return true;    
    return false;
  }

  // return true if the unicode character is an alpha-numeric character

00675   bool Unicode::isalpha (const t_quad value) {
    // check for a digit
    if (Unicode::isdigit (value) == true) return true;
    // check for letter
    if (Unicode::isletter (value) == true) return true;
    // not alpha
    return false;
  }

  // return true if the unicode character is a blank or tab
  
00686   bool Unicode::isblank (const t_quad value) {
    if ((value == blkq) || (value == tabq)) return true;
    return false;
  }

  // return true if the unicode character is an ascii character
  
00693   bool Unicode::isascii (const t_quad value) {
    if ((value & 0xFFFFFF80) == 0x00000000) return true;
    return false;
  }

  // return true if the unicode character is a latin character
  
00700   bool Unicode::islatin (const t_quad value) {
    if ((value & 0xFFFFFF00) == 0x00000000) return true;
    return false;
  }

  // return true if the unicode character is an hexadecimal character
  
00707   bool Unicode::ishexa (const t_quad value) {
    if ((value >= (t_quad) '0') && (value <= (t_quad) '9')) return true;
    if ((value >= (t_quad) 'a') && (value <= (t_quad) 'f')) return true;
    if ((value >= (t_quad) 'A') && (value <= (t_quad) 'F')) return true;
    return false;
  }

  // return true if the character is an afnix constituent

00716   bool Unicode::isafnix (const t_quad value) {
    // check for an alhpa character
    if (isalpha (value) == true) return true;
    // check for other constituents
    if (value == (t_quad) '.') return true;
    if (value == (t_quad) '+') return true;
    if (value == (t_quad) '-') return true;
    if (value == (t_quad) '*') return true;
    if (value == (t_quad) '/') return true;
    if (value == (t_quad) '!') return true;
    if (value == (t_quad) '=') return true;
    if (value == (t_quad) '.') return true;
    if (value == (t_quad) '>') return true;
    if (value == (t_quad) '<') return true;
    if (value == (t_quad) '?') return true;
    return false;
  }

  // encode a unicode character in UTF-8

00736   char* Unicode::encode (const t_quad c) {
    return Unicode::encode (&c, 1);
  }

  // encode a unicode string in UTF-8

00742   char* Unicode::encode (const t_quad* s) {
    // get the size and encode
    long size = Unicode::strlen (s);
    return encode (s, size);
  }

  // encode a unicode string in UTF-8
  
00750   char* Unicode::encode (const t_quad* s, const long size) {
    // check the size
    if (size <= 0) return nilp;
    // allocate the character buffer
    char* buf = new char[size*6+1];
    long  idx = 0;
    // loop in the buffer and encode
    for (long i = 0; i < size; i++) {
      // get the character value
      t_quad value = s[i];
      // encode the value
      if (value < 0x00000080) {
      buf[idx++] = (char) value;
      } else if (value < 0x00000800) {
      buf[idx++] = (char) (0x000000C0 | ((value >> 6)  & 0x0000001F));
      buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x00010000) {
      buf[idx++] = (char) (0x000000E0 | ((value >> 12) & 0x0000000F));
      buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
      buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x00200000) {
      buf[idx++] = (char) (0x000000F0 | ((value >> 18) & 0x00000007));
      buf[idx++] = (char) (0x00000080 | ((value >> 12) & 0x0000003F));
      buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
      buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x04000000) {
      buf[idx++] = (char) (0x000000F8 | ((value >> 24) & 0x00000003));
      buf[idx++] = (char) (0x00000080 | ((value >> 18) & 0x0000003F));
      buf[idx++] = (char) (0x00000080 | ((value >> 12) & 0x0000003F));
      buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
      buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x80000000) {
      buf[idx++] = (char) (0x000000FC | ((value >> 30) & 0x00000001));
      buf[idx++] = (char) (0x00000080 | ((value >> 24) & 0x0000003F));
      buf[idx++] = (char) (0x00000080 | ((value >> 18) & 0x0000003F));
      buf[idx++] = (char) (0x00000080 | ((value >> 12) & 0x0000003F));
      buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
      buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else {
      throw Exception ("encode-error", 
                   "invalid character to encode in utf-8 mode");
      }
    }
    // add the nil character
    buf[idx++] = nilc;
    // here we are
    return buf;
  }

  // decode a unicode buffer

00801   t_quad* Unicode::decode (const char* s) {
    // get the size and decode
    long size = Ascii::strlen (s);
    return Unicode::decode (s, size); 
  }

  // decode a unicode buffer by size

00809   t_quad* Unicode::decode (const char* s, const long size) {
    // check the size
    if (size <= 0) return nilp;
    // allocate the quad buffer
    t_quad* buf = new t_quad[size+1];
    long    idx = 0;
    for (long i = 0; i < size; i++) {
      // read first byte
      t_byte b1 = (t_byte) s[i];
      // 1 byte mode
      if (b1 < 0x80) {
      buf[idx++] = (t_quad) b1;
      continue;
      }
      // 2 bytes mode
      if (b1 < 0xE0) {
      buf[idx] = ((t_quad) (b1 & 0x3F)) << 6;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b2 = (t_byte) s[i];
      if ((b2 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b2 & 0x3F);
      if (buf[idx++] < 0x00000080) {
        throw Exception ("decode-error", "invalid long utf-8 sequence");
      }
      continue;
      }
      // 3 bytes mode
      if (b1 < 0xF0) {
      buf[idx] = ((t_quad) (b1 & 0x0F)) << 12;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b2 = (t_byte) s[i];
      if ((b2 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= ((t_quad) (b2 & 0x3F)) << 6;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b3 = (t_byte) s[i];
      if ((b3 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b3 & 0x3F);
      if (buf[idx++] < 0x00000800) {
        throw Exception ("decode-error", "invalid long utf-8 sequence");
      }
      continue;
      }
      // 4 bytes mode
      if (b1 < 0xF8) {
      buf[idx] = ((t_quad) (b1 & 0x07)) << 18;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b2 = (t_byte) s[i];
      if ((b2 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= ((t_quad) (b2 & 0x3F)) << 12;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b3 = (t_byte) s[i];
      if ((b3 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b3 & 0x3F) << 6;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b4 = (t_byte) s[i];
      if ((b4 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b4 & 0x3F);
      if (buf[idx++] < 0x00010000) {
        throw Exception ("decode-error", "invalid long utf-8 sequence");
      }
      continue;
      }
      // 5 bytes mode
      if (b1 < 0xFC) {
      buf[idx] = ((t_quad) (b1 & 0x03)) << 24;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b2 = (t_byte) s[i];
      if ((b2 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= ((t_quad) (b2 & 0x3F)) << 18;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b3 = (t_byte) s[i];
      if ((b3 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b3 & 0x3F) << 12;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b4 = (t_byte) s[i];
      if ((b4 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b4 & 0x3F) << 6;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b5 = (t_byte) s[i];
      if ((b5 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b5 & 0x3F);
      if (buf[idx++] < 0x00200000) {
        throw Exception ("decode-error", "invalid long utf-8 sequence");
      }
      continue;
      }
      // 6 bytes mode
      if (b1 < 0xFE) {
      buf[idx] = ((t_quad) (b1 & 0x01)) << 30;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b2 = (t_byte) s[i];
      if ((b2 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= ((t_quad) (b2 & 0x3F)) << 24;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b3 = (t_byte) s[i];
      if ((b3 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b3 & 0x3F) << 18;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b4 = (t_byte) s[i];
      if ((b4 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b4 & 0x3F) << 12;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b5 = (t_byte) s[i];
      if ((b5 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b5 & 0x3F) << 6;
      if (++i >= size) {
        throw Exception ("decode-error", 
                     "invalid eos while reading utf-8 sequence");
      }
      t_byte b6 = (t_byte) s[i];
      if ((b6 & 0x80) != 0x80) {
        throw Exception ("decode-error", 
                     "invalid byte while reading utf-8 sequence");
      }
      buf[idx] |= (t_quad) (b6 & 0x3F);
      if (buf[idx++] < 0x04000000) {
        throw Exception ("decode-error", "invalid long utf-8 sequence");
      }
      continue;
      }
      throw Exception ("decode-error", "invalid utf-8 character sequence");
    }
    // add the nil quad
    buf[idx++] = nilq;
    // here we are
    return buf;
  }
}

Generated by  Doxygen 1.6.0   Back to index