PerlRegEx.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002 * Copyright (C) 2005 Novell, Inc. All rights reserved.
00003 *
00004 * Redistribution and use in source and binary forms, with or without
00005 * modification, are permitted provided that the following conditions are met:
00006 *
00007 *  - Redistributions of source code must retain the above copyright notice,
00008 *    this list of conditions and the following disclaimer.
00009 *
00010 *  - Redistributions in binary form must reproduce the above copyright notice,
00011 *    this list of conditions and the following disclaimer in the documentation
00012 *    and/or other materials provided with the distribution.
00013 *
00014 *  - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
00015 *    contributors may be used to endorse or promote products derived from this
00016 *    software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
00019 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE 
00022 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
00025 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
00026 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
00027 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
00028 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 *******************************************************************************/
00034 #include "blocxx/PerlRegEx.hpp"
00035 
00036 #ifdef BLOCXX_HAVE_PCRE
00037 #ifdef BLOCXX_HAVE_PCRE_H
00038 
#include "blocxx/ExceptionIds.hpp"
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"

#include <vector>
00042 
00043 
00044 namespace BLOCXX_NAMESPACE
00045 {
00046 
00047 
00048 // -------------------------------------------------------------------
00049 static String
00050 substitute_caps(const PerlRegEx::MatchArray &sub,
00051                 const String &str, const String &rep)
00052 {
00053    static const char *cap_refs[] = {
00054       NULL,  "\\1", "\\2", "\\3", "\\4",
00055       "\\5", "\\6", "\\7", "\\8", "\\9", NULL
00056    };
00057 
00058    String res( rep);
00059    size_t pos;
00060 
00061    for(size_t i=1; cap_refs[i] != NULL; i++)
00062    {
00063       String cap;
00064       if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
00065       {
00066          cap = str.substring(sub[i].rm_so, sub[i].rm_eo
00067                                          - sub[i].rm_so);
00068       }
00069 
00070       pos = res.indexOf(cap_refs[i]);
00071       while( pos != String::npos)
00072       {
00073          size_t quotes = 0;
00074          size_t at = pos;
00075 
00076          while( at > 0 && res.charAt(--at) == '\\')
00077             quotes++;
00078 
00079          if( quotes % 2)
00080          {
00081             quotes = (quotes + 1) / 2;
00082 
00083             res = res.erase(pos - quotes, quotes);
00084 
00085             pos = res.indexOf(cap_refs[i],
00086                               pos + 2 - quotes);
00087          }
00088          else
00089          {
00090             quotes = quotes / 2;
00091 
00092             res = res.substring(0, pos - quotes) +
00093                   cap +
00094                   res.substring(pos + 2);
00095 
00096             pos = res.indexOf(cap_refs[i],
00097                               pos + cap.length() - quotes);
00098          }
00099       }
00100    }
00101    return res;
00102 }
00103 
00104 
00105 // -------------------------------------------------------------------
00106 static inline String
00107 getError(const int errcode)
00108 {
00109    const char *ptr;
00110    switch(errcode)
00111    {
00112       case 0:
00113          ptr = "match vector to small";
00114       break;
00115 
00116       case PCRE_ERROR_NOMATCH:
00117          ptr = "match failed";
00118       break;
00119 
00120       case PCRE_ERROR_NULL:
00121          ptr = "invalid argument";
00122       break;
00123 
00124       case PCRE_ERROR_BADOPTION:
00125          ptr = "unrecognized option";
00126       break;
00127 
00128       case PCRE_ERROR_BADMAGIC:
00129          ptr = "invalid magic number";
00130       break;
00131 
00132       case PCRE_ERROR_UNKNOWN_NODE:
00133          ptr = "unknown item in the compiled pattern";
00134       break;
00135 
00136       case PCRE_ERROR_NOMEMORY:
00137          ptr = "failed to allocate memory";
00138       break;
00139 
00140       case PCRE_ERROR_NOSUBSTRING:
00141          // .*_substring.* functions only
00142          ptr = "failed to retrieve substring";
00143       break;
00144 
00145       case PCRE_ERROR_MATCHLIMIT:
00146          // match_limit in pcre_extra struct
00147          ptr = "recursion or backtracking limit reached";
00148       break;
00149 
00150       case PCRE_ERROR_CALLOUT:
00151          // reserved for pcrecallout functions
00152          ptr = "callout failure";
00153       break;
00154 
00155       case PCRE_ERROR_BADUTF8:
00156          ptr = "invalid UTF-8 byte sequence found";
00157       break;
00158 
00159       case PCRE_ERROR_BADUTF8_OFFSET:
00160          ptr = "not a UTF-8 character at specified index";
00161       break;
00162 
00163       case PCRE_ERROR_PARTIAL:
00164          ptr = "partial match";
00165       break;
00166 
00167       case PCRE_ERROR_BADPARTIAL:
00168          ptr = "pattern item not supported for partial matching";
00169       break;
00170 
00171       case PCRE_ERROR_INTERNAL:
00172          ptr = "unexpected internal error occurred";
00173       break;
00174 
00175       case PCRE_ERROR_BADCOUNT:
00176          ptr = "invalid (negative) match vector count";
00177       break;
00178 
00179       default:
00180          ptr = "unknown error code";
00181       break;
00182    }
00183    return String(ptr);
00184 }
00185 
00186 // -------------------------------------------------------------------
00187 PerlRegEx::PerlRegEx()
00188    : m_pcre(NULL)
00189    , m_flags(0)
00190    , m_ecode(0)
00191 {
00192 }
00193 
00194 
00195 // -------------------------------------------------------------------
00196 PerlRegEx::PerlRegEx(const String &regex, int cflags)
00197    : m_pcre(NULL)
00198    , m_flags(0)
00199    , m_ecode(0)
00200 {
00201    if( !compile(regex, cflags))
00202    {
00203       BLOCXX_THROW_ERR(RegExCompileException,
00204          errorString().c_str(), m_ecode);
00205    }
00206 }
00207 
00208 
00209 // -------------------------------------------------------------------
00210 PerlRegEx::PerlRegEx(const PerlRegEx &ref)
00211    : m_pcre(NULL)
00212    , m_flags(ref.m_flags)
00213    , m_ecode(0)
00214    , m_rxstr(ref.m_rxstr)
00215 {
00216    if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
00217    {
00218       BLOCXX_THROW_ERR(RegExCompileException,
00219          errorString().c_str(), m_ecode);
00220    }
00221 }
00222 
00223 // -------------------------------------------------------------------
00224 PerlRegEx::~PerlRegEx()
00225 {
00226    if( m_pcre)
00227    {
00228       free(m_pcre);
00229       m_pcre = NULL;
00230    }
00231 }
00232 
00233 
00234 // -------------------------------------------------------------------
00235 PerlRegEx &
00236 PerlRegEx::operator = (const PerlRegEx &ref)
00237 {
00238    if( ref.m_pcre == NULL)
00239    {
00240       m_ecode = 0;
00241       m_error.erase();
00242       m_flags = ref.m_flags;
00243       m_rxstr = ref.m_rxstr;
00244       if( m_pcre != NULL)
00245       {
00246          free(m_pcre);
00247          m_pcre = NULL;
00248       }
00249    }
00250    else if( !compile(ref.m_rxstr, ref.m_flags))
00251    {
00252       BLOCXX_THROW_ERR(RegExCompileException,
00253          errorString().c_str(), m_ecode);
00254    }
00255    return *this;
00256 }
00257 
00258 
00259 // -------------------------------------------------------------------
00260 bool
00261 PerlRegEx::compile(const String &regex, int cflags)
00262 {
00263    if( m_pcre)
00264    {
00265       free(m_pcre);
00266       m_pcre = NULL;
00267    }
00268 
00269    const char *errptr = NULL;
00270 
00271    m_ecode = 0;
00272    m_pcre  = ::pcre_compile(regex.c_str(), cflags,
00273                             &errptr, &m_ecode, NULL);
00274    if( m_pcre == NULL)
00275    {
00276       m_error = String(errptr ? errptr : "");
00277       m_rxstr.erase();
00278       m_flags = 0;
00279       return false;
00280    }
00281    else
00282    {
00283       m_error.erase();
00284       m_rxstr = regex;
00285       m_flags = cflags;
00286       return true;
00287    }
00288 }
00289 
00290 
00291 // -------------------------------------------------------------------
00292 int
00293 PerlRegEx::errorCode()
00294 {
00295    return m_ecode;
00296 }
00297 
00298 
00299 // -------------------------------------------------------------------
00300 String
00301 PerlRegEx::errorString() const
00302 {
00303    return m_error;
00304 }
00305 
00306 
00307 // -------------------------------------------------------------------
00308 String
00309 PerlRegEx::patternString() const
00310 {
00311    return m_rxstr;
00312 }
00313 
00314 
00315 // -------------------------------------------------------------------
00316 int
00317 PerlRegEx::compileFlags() const
00318 {
00319    return m_flags;
00320 }
00321 
00322 
00323 // -------------------------------------------------------------------
00324 bool
00325 PerlRegEx::isCompiled() const
00326 {
00327    return (m_pcre != NULL);
00328 }
00329 
00330 
00331 // -------------------------------------------------------------------
00332 bool
00333 PerlRegEx::execute(MatchArray &sub, const String &str,
00334                size_t index, size_t count, int eflags)
00335 {
00336    if( m_pcre == NULL)
00337    {
00338       BLOCXX_THROW(RegExCompileException,
00339          "Regular expression is not compiled");
00340    }
00341    if( count >= size_t(INT_MAX / 3))
00342    {
00343       BLOCXX_THROW(AssertionException,
00344          "Match count limit exceeded");
00345    }
00346 
00347    if( index > str.length())
00348    {
00349       BLOCXX_THROW(OutOfBoundsException,
00350          Format("String index out of bounds ("
00351                 "length = %1, index = %2).",
00352                 str.length(), index
00353          ).c_str());
00354    }
00355 
00356    if( count == 0)
00357    {
00358       int cnt = 0;
00359       int ret = ::pcre_fullinfo(m_pcre, NULL,
00360                                 PCRE_INFO_CAPTURECOUNT, &cnt);
00361       if( ret)
00362       {
00363          m_error = getError(m_ecode);
00364          return false;
00365       }
00366       count = cnt > 0 ? cnt + 1 : 1;
00367    }
00368    int vsub[count * 3];
00369 
00370    sub.clear();
00371    m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00372                          index, eflags, vsub, count * 3);
00373    //
00374    // pcre_exec returns 0 if vector too small, negative value
00375    // on errors or the number of matches (number of int pairs)
00376    //
00377    if( m_ecode > 0)
00378    {
00379       sub.resize(count); // as specified by user
00380       for(size_t i = 0, n = 0; i < count; i++, n += 2)
00381       {
00382          match_t  m = { vsub[n], vsub[n+1] };
00383 
00384          // if user wants more than detected
00385          if( i >= (size_t)m_ecode)
00386             m.rm_so = m.rm_eo = -1;
00387 
00388          sub[i] = m;
00389       }
00390       m_error.erase();
00391       return true;
00392    }
00393    else
00394    {
00395       m_error = getError(m_ecode);
00396       return false;
00397    }
00398 }
00399 
00400 
00401 // -------------------------------------------------------------------
00402 bool
00403 PerlRegEx::execute(MatchVector &sub, const String &str,
00404                size_t index, size_t count, int eflags)
00405 {
00406    if( m_pcre == NULL)
00407    {
00408       BLOCXX_THROW(RegExCompileException,
00409          "Regular expression is not compiled");
00410    }
00411    if( count >= size_t(INT_MAX / 3))
00412    {
00413       BLOCXX_THROW(AssertionException,
00414          "Match count limit exceeded");
00415    }
00416 
00417    if( index > str.length())
00418    {
00419       BLOCXX_THROW(OutOfBoundsException,
00420          Format("String index out of bounds ("
00421                 "length = %1, index = %2)",
00422                 str.length(), index
00423          ).c_str());
00424    }
00425 
00426    if( count == 0)
00427    {
00428       int cnt = 0;
00429       int ret = ::pcre_fullinfo(m_pcre, NULL,
00430                                 PCRE_INFO_CAPTURECOUNT, &cnt);
00431       if( ret)
00432       {
00433          m_error = getError(m_ecode);
00434          return false;
00435       }
00436       count = cnt > 0 ? cnt + 1 : 1;
00437    }
00438    int vsub[count * 3];
00439 
00440    sub.clear();
00441    m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00442                          index, eflags, vsub, count * 3);
00443    //
00444    // pcre_exec returns 0 if vector too small, negative value
00445    // on errors or the number of matches (number of int pairs)
00446    //
00447    if( m_ecode > 0)
00448    {
00449       count   *= 2;
00450       m_ecode *= 2;
00451       sub.resize(count); // as specified by user
00452       for(size_t i = 0; i < count; i++)
00453       {
00454          // if user wants more than detected
00455          if( i >= (size_t)m_ecode)
00456             vsub[i] = -1;
00457 
00458          sub[i] = vsub[i];
00459       }
00460       return true;
00461    }
00462    else
00463    {
00464       m_error = getError(m_ecode);
00465       return false;
00466    }
00467 }
00468 
00469 
00470 // -------------------------------------------------------------------
00471 StringArray
00472 PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
00473 {
00474    if( m_pcre == NULL)
00475    {
00476       BLOCXX_THROW(RegExCompileException,
00477          "Regular expression is not compiled");
00478    }
00479 
00480    MatchArray  rsub;
00481    StringArray ssub;
00482 
00483    bool match = execute(rsub, str, index, count, eflags);
00484    if( match)
00485    {
00486       if( rsub.empty())
00487       {
00488          BLOCXX_THROW(RegExCompileException,
00489             "Non-capturing regular expression");
00490       }
00491 
00492       MatchArray::const_iterator i=rsub.begin();
00493       for( ; i != rsub.end(); ++i)
00494       {
00495          if( i->rm_so >= 0 && i->rm_eo >= 0)
00496          {
00497             ssub.push_back(str.substring(i->rm_so,
00498                            i->rm_eo - i->rm_so));
00499          }
00500          else
00501          {
00502             ssub.push_back(String(""));
00503          }
00504       }
00505    }
00506    else if(m_ecode != PCRE_ERROR_NOMATCH)
00507    {
00508       BLOCXX_THROW_ERR(RegExExecuteException,
00509          errorString().c_str(), m_ecode);
00510    }
00511    return ssub;
00512 }
00513 
00514 
00515 // -------------------------------------------------------------------
00516 blocxx::String
00517 PerlRegEx::replace(const String &str, const String &rep,
00518                    bool global, int eflags)
00519 {
00520    if( m_pcre == NULL)
00521    {
00522       BLOCXX_THROW(RegExCompileException,
00523          "Regular expression is not compiled");
00524    }
00525 
00526    MatchArray  rsub;
00527    bool        match;
00528    size_t      off = 0;
00529    String      out = str;
00530 
00531    do
00532    {
00533       match = execute(rsub, out, off, 0, eflags);
00534       if( match)
00535       {
00536          if( rsub.empty()      ||
00537              rsub[0].rm_so < 0 ||
00538              rsub[0].rm_eo < 0)
00539          {
00540             // only if empty (missused as guard).
00541             BLOCXX_THROW(RegExCompileException,
00542                "Non-capturing regular expression");
00543          }
00544 
00545          String res = substitute_caps(rsub, out, rep);
00546 
00547          out = out.substring(0, rsub[0].rm_so) +
00548                res + out.substring(rsub[0].rm_eo);
00549 
00550          off = rsub[0].rm_so + res.length();
00551       }
00552       else if(m_ecode == PCRE_ERROR_NOMATCH)
00553       {
00554          m_ecode = 0;
00555          m_error.erase();
00556       }
00557       else
00558       {
00559          BLOCXX_THROW_ERR(RegExExecuteException,
00560             errorString().c_str(), m_ecode);
00561       }
00562    } while(global && match && out.length() > off);
00563 
00564    return out;
00565 }
00566 
00567 
00568 // -------------------------------------------------------------------
00569 StringArray
00570 PerlRegEx::split(const String &str, bool empty, int eflags)
00571 {
00572    if( m_pcre == NULL)
00573    {
00574       BLOCXX_THROW(RegExCompileException,
00575          "Regular expression is not compiled");
00576    }
00577 
00578    MatchArray  rsub;
00579    StringArray ssub;
00580    bool        match;
00581    size_t      off = 0;
00582    size_t      len = str.length();
00583 
00584    do
00585    {
00586       match = execute(rsub, str, off, 0, eflags);
00587       if( match)
00588       {
00589          if( rsub.empty()      ||
00590              rsub[0].rm_so < 0 ||
00591              rsub[0].rm_eo < 0)
00592          {
00593             BLOCXX_THROW(RegExCompileException,
00594                "Non-capturing regular expression");
00595          }
00596 
00597          if( empty || ((size_t)rsub[0].rm_so > off))
00598          {
00599             ssub.push_back(str.substring(off,
00600                                rsub[0].rm_so - off));
00601          }
00602          off = rsub[0].rm_eo;
00603       }
00604       else if(m_ecode == PCRE_ERROR_NOMATCH)
00605       {
00606          String tmp = str.substring(off);
00607          if( empty || !tmp.empty())
00608          {
00609             ssub.push_back(tmp);
00610          }
00611          m_ecode = 0;
00612          m_error.erase();
00613       }
00614       else
00615       {
00616          BLOCXX_THROW_ERR(RegExExecuteException,
00617             errorString().c_str(), m_ecode);
00618       }
00619    } while(match && len > off);
00620 
00621    return ssub;
00622 }
00623 
00624 
00625 // -------------------------------------------------------------------
00626 StringArray
00627 PerlRegEx::grep(const StringArray &src, int eflags)
00628 {
00629    if( m_pcre == NULL)
00630    {
00631       BLOCXX_THROW(RegExCompileException,
00632          "Regular expression is not compiled");
00633    }
00634 
00635    m_ecode = 0;
00636    m_error.erase();
00637 
00638    StringArray out;
00639    if( !src.empty())
00640    {
00641       StringArray::const_iterator i=src.begin();
00642       for( ; i != src.end(); ++i)
00643       {
00644          int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
00645                    i->length(), 0, eflags, NULL, 0);
00646          if( ret >= 0)
00647          {
00648             out.push_back(*i);
00649          }
00650          else if( ret != PCRE_ERROR_NOMATCH)
00651          {
00652             m_ecode = ret;
00653             m_error = getError(m_ecode);
00654             BLOCXX_THROW_ERR(RegExExecuteException,
00655                errorString().c_str(), m_ecode);
00656          }
00657       }
00658    }
00659    return out;
00660 }
00661 
00662 
00663 // -------------------------------------------------------------------
00664 bool
00665 PerlRegEx::match(const String &str, size_t index, int eflags) const
00666 {
00667    if( m_pcre == NULL)
00668    {
00669       BLOCXX_THROW(RegExCompileException,
00670          "Regular expression is not compiled");
00671    }
00672 
00673    if( index > str.length())
00674    {
00675       BLOCXX_THROW(OutOfBoundsException,
00676          Format("String index out of bounds."
00677                 "length = %1, index = %2",
00678                 str.length(), index
00679          ).c_str());
00680    }
00681 
00682    m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
00683              str.length(), 0, eflags, NULL, 0);
00684    if( m_ecode >= 0)
00685    {
00686       m_error.erase();
00687       return true;
00688    }
00689    else if( m_ecode == PCRE_ERROR_NOMATCH)
00690    {
00691       m_error = getError(m_ecode);
00692       return false;
00693    }
00694    else
00695    {
00696       m_error = getError(m_ecode);
00697       BLOCXX_THROW_ERR(RegExExecuteException,
00698          errorString().c_str(), m_ecode);
00699    }
00700 }
00701 
00702 
00703 // -------------------------------------------------------------------
00704 } // namespace BLOCXX_NAMESPACE
00705 
00706 #endif // BLOCXX_HAVE_PCRE_H
00707 #endif // BLOCXX_HAVE_PCRE
00708 
00709 /* vim: set ts=8 sts=8 sw=8 ai noet: */
00710 

Generated on Fri Jun 16 15:39:08 2006 for blocxx by  doxygen 1.4.6