00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00034 #include "blocxx/PerlRegEx.hpp"
00035
00036 #ifdef BLOCXX_HAVE_PCRE
00037 #ifdef BLOCXX_HAVE_PCRE_H
00038
#include "blocxx/ExceptionIds.hpp"
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"

#include <climits>
#include <vector>
00042
00043
00044 namespace BLOCXX_NAMESPACE
00045 {
00046
00047
00048
00049 static String
00050 substitute_caps(const PerlRegEx::MatchArray &sub,
00051 const String &str, const String &rep)
00052 {
00053 static const char *cap_refs[] = {
00054 NULL, "\\1", "\\2", "\\3", "\\4",
00055 "\\5", "\\6", "\\7", "\\8", "\\9", NULL
00056 };
00057
00058 String res( rep);
00059 size_t pos;
00060
00061 for(size_t i=1; cap_refs[i] != NULL; i++)
00062 {
00063 String cap;
00064 if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
00065 {
00066 cap = str.substring(sub[i].rm_so, sub[i].rm_eo
00067 - sub[i].rm_so);
00068 }
00069
00070 pos = res.indexOf(cap_refs[i]);
00071 while( pos != String::npos)
00072 {
00073 size_t quotes = 0;
00074 size_t at = pos;
00075
00076 while( at > 0 && res.charAt(--at) == '\\')
00077 quotes++;
00078
00079 if( quotes % 2)
00080 {
00081 quotes = (quotes + 1) / 2;
00082
00083 res = res.erase(pos - quotes, quotes);
00084
00085 pos = res.indexOf(cap_refs[i],
00086 pos + 2 - quotes);
00087 }
00088 else
00089 {
00090 quotes = quotes / 2;
00091
00092 res = res.substring(0, pos - quotes) +
00093 cap +
00094 res.substring(pos + 2);
00095
00096 pos = res.indexOf(cap_refs[i],
00097 pos + cap.length() - quotes);
00098 }
00099 }
00100 }
00101 return res;
00102 }
00103
00104
00105
00106 static inline String
00107 getError(const int errcode)
00108 {
00109 const char *ptr;
00110 switch(errcode)
00111 {
00112 case 0:
00113 ptr = "match vector to small";
00114 break;
00115
00116 case PCRE_ERROR_NOMATCH:
00117 ptr = "match failed";
00118 break;
00119
00120 case PCRE_ERROR_NULL:
00121 ptr = "invalid argument";
00122 break;
00123
00124 case PCRE_ERROR_BADOPTION:
00125 ptr = "unrecognized option";
00126 break;
00127
00128 case PCRE_ERROR_BADMAGIC:
00129 ptr = "invalid magic number";
00130 break;
00131
00132 case PCRE_ERROR_UNKNOWN_NODE:
00133 ptr = "unknown item in the compiled pattern";
00134 break;
00135
00136 case PCRE_ERROR_NOMEMORY:
00137 ptr = "failed to allocate memory";
00138 break;
00139
00140 case PCRE_ERROR_NOSUBSTRING:
00141
00142 ptr = "failed to retrieve substring";
00143 break;
00144
00145 case PCRE_ERROR_MATCHLIMIT:
00146
00147 ptr = "recursion or backtracking limit reached";
00148 break;
00149
00150 case PCRE_ERROR_CALLOUT:
00151
00152 ptr = "callout failure";
00153 break;
00154
00155 case PCRE_ERROR_BADUTF8:
00156 ptr = "invalid UTF-8 byte sequence found";
00157 break;
00158
00159 case PCRE_ERROR_BADUTF8_OFFSET:
00160 ptr = "not a UTF-8 character at specified index";
00161 break;
00162
00163 case PCRE_ERROR_PARTIAL:
00164 ptr = "partial match";
00165 break;
00166
00167 case PCRE_ERROR_BADPARTIAL:
00168 ptr = "pattern item not supported for partial matching";
00169 break;
00170
00171 case PCRE_ERROR_INTERNAL:
00172 ptr = "unexpected internal error occurred";
00173 break;
00174
00175 case PCRE_ERROR_BADCOUNT:
00176 ptr = "invalid (negative) match vector count";
00177 break;
00178
00179 default:
00180 ptr = "unknown error code";
00181 break;
00182 }
00183 return String(ptr);
00184 }
00185
00186
00187 PerlRegEx::PerlRegEx()
00188 : m_pcre(NULL)
00189 , m_flags(0)
00190 , m_ecode(0)
00191 {
00192 }
00193
00194
00195
00196 PerlRegEx::PerlRegEx(const String ®ex, int cflags)
00197 : m_pcre(NULL)
00198 , m_flags(0)
00199 , m_ecode(0)
00200 {
00201 if( !compile(regex, cflags))
00202 {
00203 BLOCXX_THROW_ERR(RegExCompileException,
00204 errorString().c_str(), m_ecode);
00205 }
00206 }
00207
00208
00209
00210 PerlRegEx::PerlRegEx(const PerlRegEx &ref)
00211 : m_pcre(NULL)
00212 , m_flags(ref.m_flags)
00213 , m_ecode(0)
00214 , m_rxstr(ref.m_rxstr)
00215 {
00216 if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
00217 {
00218 BLOCXX_THROW_ERR(RegExCompileException,
00219 errorString().c_str(), m_ecode);
00220 }
00221 }
00222
00223
00224 PerlRegEx::~PerlRegEx()
00225 {
00226 if( m_pcre)
00227 {
00228 free(m_pcre);
00229 m_pcre = NULL;
00230 }
00231 }
00232
00233
00234
00235 PerlRegEx &
00236 PerlRegEx::operator = (const PerlRegEx &ref)
00237 {
00238 if( ref.m_pcre == NULL)
00239 {
00240 m_ecode = 0;
00241 m_error.erase();
00242 m_flags = ref.m_flags;
00243 m_rxstr = ref.m_rxstr;
00244 if( m_pcre != NULL)
00245 {
00246 free(m_pcre);
00247 m_pcre = NULL;
00248 }
00249 }
00250 else if( !compile(ref.m_rxstr, ref.m_flags))
00251 {
00252 BLOCXX_THROW_ERR(RegExCompileException,
00253 errorString().c_str(), m_ecode);
00254 }
00255 return *this;
00256 }
00257
00258
00259
00260 bool
00261 PerlRegEx::compile(const String ®ex, int cflags)
00262 {
00263 if( m_pcre)
00264 {
00265 free(m_pcre);
00266 m_pcre = NULL;
00267 }
00268
00269 const char *errptr = NULL;
00270
00271 m_ecode = 0;
00272 m_pcre = ::pcre_compile(regex.c_str(), cflags,
00273 &errptr, &m_ecode, NULL);
00274 if( m_pcre == NULL)
00275 {
00276 m_error = String(errptr ? errptr : "");
00277 m_rxstr.erase();
00278 m_flags = 0;
00279 return false;
00280 }
00281 else
00282 {
00283 m_error.erase();
00284 m_rxstr = regex;
00285 m_flags = cflags;
00286 return true;
00287 }
00288 }
00289
00290
00291
00292 int
00293 PerlRegEx::errorCode()
00294 {
00295 return m_ecode;
00296 }
00297
00298
00299
00300 String
00301 PerlRegEx::errorString() const
00302 {
00303 return m_error;
00304 }
00305
00306
00307
00308 String
00309 PerlRegEx::patternString() const
00310 {
00311 return m_rxstr;
00312 }
00313
00314
00315
00316 int
00317 PerlRegEx::compileFlags() const
00318 {
00319 return m_flags;
00320 }
00321
00322
00323
00324 bool
00325 PerlRegEx::isCompiled() const
00326 {
00327 return (m_pcre != NULL);
00328 }
00329
00330
00331
00332 bool
00333 PerlRegEx::execute(MatchArray &sub, const String &str,
00334 size_t index, size_t count, int eflags)
00335 {
00336 if( m_pcre == NULL)
00337 {
00338 BLOCXX_THROW(RegExCompileException,
00339 "Regular expression is not compiled");
00340 }
00341 if( count >= size_t(INT_MAX / 3))
00342 {
00343 BLOCXX_THROW(AssertionException,
00344 "Match count limit exceeded");
00345 }
00346
00347 if( index > str.length())
00348 {
00349 BLOCXX_THROW(OutOfBoundsException,
00350 Format("String index out of bounds ("
00351 "length = %1, index = %2).",
00352 str.length(), index
00353 ).c_str());
00354 }
00355
00356 if( count == 0)
00357 {
00358 int cnt = 0;
00359 int ret = ::pcre_fullinfo(m_pcre, NULL,
00360 PCRE_INFO_CAPTURECOUNT, &cnt);
00361 if( ret)
00362 {
00363 m_error = getError(m_ecode);
00364 return false;
00365 }
00366 count = cnt > 0 ? cnt + 1 : 1;
00367 }
00368 int vsub[count * 3];
00369
00370 sub.clear();
00371 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00372 index, eflags, vsub, count * 3);
00373
00374
00375
00376
00377 if( m_ecode > 0)
00378 {
00379 sub.resize(count);
00380 for(size_t i = 0, n = 0; i < count; i++, n += 2)
00381 {
00382 match_t m = { vsub[n], vsub[n+1] };
00383
00384
00385 if( i >= (size_t)m_ecode)
00386 m.rm_so = m.rm_eo = -1;
00387
00388 sub[i] = m;
00389 }
00390 m_error.erase();
00391 return true;
00392 }
00393 else
00394 {
00395 m_error = getError(m_ecode);
00396 return false;
00397 }
00398 }
00399
00400
00401
00402 bool
00403 PerlRegEx::execute(MatchVector &sub, const String &str,
00404 size_t index, size_t count, int eflags)
00405 {
00406 if( m_pcre == NULL)
00407 {
00408 BLOCXX_THROW(RegExCompileException,
00409 "Regular expression is not compiled");
00410 }
00411 if( count >= size_t(INT_MAX / 3))
00412 {
00413 BLOCXX_THROW(AssertionException,
00414 "Match count limit exceeded");
00415 }
00416
00417 if( index > str.length())
00418 {
00419 BLOCXX_THROW(OutOfBoundsException,
00420 Format("String index out of bounds ("
00421 "length = %1, index = %2)",
00422 str.length(), index
00423 ).c_str());
00424 }
00425
00426 if( count == 0)
00427 {
00428 int cnt = 0;
00429 int ret = ::pcre_fullinfo(m_pcre, NULL,
00430 PCRE_INFO_CAPTURECOUNT, &cnt);
00431 if( ret)
00432 {
00433 m_error = getError(m_ecode);
00434 return false;
00435 }
00436 count = cnt > 0 ? cnt + 1 : 1;
00437 }
00438 int vsub[count * 3];
00439
00440 sub.clear();
00441 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
00442 index, eflags, vsub, count * 3);
00443
00444
00445
00446
00447 if( m_ecode > 0)
00448 {
00449 count *= 2;
00450 m_ecode *= 2;
00451 sub.resize(count);
00452 for(size_t i = 0; i < count; i++)
00453 {
00454
00455 if( i >= (size_t)m_ecode)
00456 vsub[i] = -1;
00457
00458 sub[i] = vsub[i];
00459 }
00460 return true;
00461 }
00462 else
00463 {
00464 m_error = getError(m_ecode);
00465 return false;
00466 }
00467 }
00468
00469
00470
00471 StringArray
00472 PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
00473 {
00474 if( m_pcre == NULL)
00475 {
00476 BLOCXX_THROW(RegExCompileException,
00477 "Regular expression is not compiled");
00478 }
00479
00480 MatchArray rsub;
00481 StringArray ssub;
00482
00483 bool match = execute(rsub, str, index, count, eflags);
00484 if( match)
00485 {
00486 if( rsub.empty())
00487 {
00488 BLOCXX_THROW(RegExCompileException,
00489 "Non-capturing regular expression");
00490 }
00491
00492 MatchArray::const_iterator i=rsub.begin();
00493 for( ; i != rsub.end(); ++i)
00494 {
00495 if( i->rm_so >= 0 && i->rm_eo >= 0)
00496 {
00497 ssub.push_back(str.substring(i->rm_so,
00498 i->rm_eo - i->rm_so));
00499 }
00500 else
00501 {
00502 ssub.push_back(String(""));
00503 }
00504 }
00505 }
00506 else if(m_ecode != PCRE_ERROR_NOMATCH)
00507 {
00508 BLOCXX_THROW_ERR(RegExExecuteException,
00509 errorString().c_str(), m_ecode);
00510 }
00511 return ssub;
00512 }
00513
00514
00515
00516 blocxx::String
00517 PerlRegEx::replace(const String &str, const String &rep,
00518 bool global, int eflags)
00519 {
00520 if( m_pcre == NULL)
00521 {
00522 BLOCXX_THROW(RegExCompileException,
00523 "Regular expression is not compiled");
00524 }
00525
00526 MatchArray rsub;
00527 bool match;
00528 size_t off = 0;
00529 String out = str;
00530
00531 do
00532 {
00533 match = execute(rsub, out, off, 0, eflags);
00534 if( match)
00535 {
00536 if( rsub.empty() ||
00537 rsub[0].rm_so < 0 ||
00538 rsub[0].rm_eo < 0)
00539 {
00540
00541 BLOCXX_THROW(RegExCompileException,
00542 "Non-capturing regular expression");
00543 }
00544
00545 String res = substitute_caps(rsub, out, rep);
00546
00547 out = out.substring(0, rsub[0].rm_so) +
00548 res + out.substring(rsub[0].rm_eo);
00549
00550 off = rsub[0].rm_so + res.length();
00551 }
00552 else if(m_ecode == PCRE_ERROR_NOMATCH)
00553 {
00554 m_ecode = 0;
00555 m_error.erase();
00556 }
00557 else
00558 {
00559 BLOCXX_THROW_ERR(RegExExecuteException,
00560 errorString().c_str(), m_ecode);
00561 }
00562 } while(global && match && out.length() > off);
00563
00564 return out;
00565 }
00566
00567
00568
00569 StringArray
00570 PerlRegEx::split(const String &str, bool empty, int eflags)
00571 {
00572 if( m_pcre == NULL)
00573 {
00574 BLOCXX_THROW(RegExCompileException,
00575 "Regular expression is not compiled");
00576 }
00577
00578 MatchArray rsub;
00579 StringArray ssub;
00580 bool match;
00581 size_t off = 0;
00582 size_t len = str.length();
00583
00584 do
00585 {
00586 match = execute(rsub, str, off, 0, eflags);
00587 if( match)
00588 {
00589 if( rsub.empty() ||
00590 rsub[0].rm_so < 0 ||
00591 rsub[0].rm_eo < 0)
00592 {
00593 BLOCXX_THROW(RegExCompileException,
00594 "Non-capturing regular expression");
00595 }
00596
00597 if( empty || ((size_t)rsub[0].rm_so > off))
00598 {
00599 ssub.push_back(str.substring(off,
00600 rsub[0].rm_so - off));
00601 }
00602 off = rsub[0].rm_eo;
00603 }
00604 else if(m_ecode == PCRE_ERROR_NOMATCH)
00605 {
00606 String tmp = str.substring(off);
00607 if( empty || !tmp.empty())
00608 {
00609 ssub.push_back(tmp);
00610 }
00611 m_ecode = 0;
00612 m_error.erase();
00613 }
00614 else
00615 {
00616 BLOCXX_THROW_ERR(RegExExecuteException,
00617 errorString().c_str(), m_ecode);
00618 }
00619 } while(match && len > off);
00620
00621 return ssub;
00622 }
00623
00624
00625
00626 StringArray
00627 PerlRegEx::grep(const StringArray &src, int eflags)
00628 {
00629 if( m_pcre == NULL)
00630 {
00631 BLOCXX_THROW(RegExCompileException,
00632 "Regular expression is not compiled");
00633 }
00634
00635 m_ecode = 0;
00636 m_error.erase();
00637
00638 StringArray out;
00639 if( !src.empty())
00640 {
00641 StringArray::const_iterator i=src.begin();
00642 for( ; i != src.end(); ++i)
00643 {
00644 int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
00645 i->length(), 0, eflags, NULL, 0);
00646 if( ret >= 0)
00647 {
00648 out.push_back(*i);
00649 }
00650 else if( ret != PCRE_ERROR_NOMATCH)
00651 {
00652 m_ecode = ret;
00653 m_error = getError(m_ecode);
00654 BLOCXX_THROW_ERR(RegExExecuteException,
00655 errorString().c_str(), m_ecode);
00656 }
00657 }
00658 }
00659 return out;
00660 }
00661
00662
00663
00664 bool
00665 PerlRegEx::match(const String &str, size_t index, int eflags) const
00666 {
00667 if( m_pcre == NULL)
00668 {
00669 BLOCXX_THROW(RegExCompileException,
00670 "Regular expression is not compiled");
00671 }
00672
00673 if( index > str.length())
00674 {
00675 BLOCXX_THROW(OutOfBoundsException,
00676 Format("String index out of bounds."
00677 "length = %1, index = %2",
00678 str.length(), index
00679 ).c_str());
00680 }
00681
00682 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
00683 str.length(), 0, eflags, NULL, 0);
00684 if( m_ecode >= 0)
00685 {
00686 m_error.erase();
00687 return true;
00688 }
00689 else if( m_ecode == PCRE_ERROR_NOMATCH)
00690 {
00691 m_error = getError(m_ecode);
00692 return false;
00693 }
00694 else
00695 {
00696 m_error = getError(m_ecode);
00697 BLOCXX_THROW_ERR(RegExExecuteException,
00698 errorString().c_str(), m_ecode);
00699 }
00700 }
00701
00702
00703
00704 }
00705
00706 #endif // BLOCXX_HAVE_PCRE_H
00707 #endif // BLOCXX_HAVE_PCRE
00708
00709
00710