|
pcre++.ccGo to the documentation of this file.00001 /* 00002 * 00003 * $Id: pcre++.cc,v 1.2 2002/01/02 01:25:30 zarahg Exp $ 00004 * 00005 * This file is part of the PCRE++ Class Library. 00006 * 00007 * By accessing this software, PCRE++, you are duly informed 00008 * of and agree to be bound by the conditions described below 00009 * in this notice: 00010 * 00011 * This software product, PCRE++, is developed by Thomas Linden 00012 * and copyrighted (C) 2002 by Thomas Linden, with all rights 00013 * reserved. 00014 * 00015 * There is no charge for PCRE++ software. You can redistribute 00016 * it and/or modify it under the terms of the GNU Lesser General 00017 * Public License, which is incorporated by reference herein. 00018 * 00019 * PCRE++ is distributed WITHOUT ANY WARRANTY, IMPLIED OR EXPRESS, 00020 * OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE or that 00021 * the use of it will not infringe on any third party's intellec- 00022 * tual property rights. 00023 * 00024 * You should have received a copy of the GNU Lesser General Public 00025 * License along with PCRE++. Copies can also be obtained from: 00026 * 00027 * http://www.gnu.org/licenses/lgpl.txt 00028 * 00029 * or by writing to: 00030 * 00031 * Free Software Foundation, Inc. 00032 * 59 Temple Place, Suite 330 00033 * Boston, MA 02111-1307 00034 * USA 00035 * 00036 * Or contact: 00037 * 00038 * "Thomas Linden" <tom@daemon.de> 00039 * 00040 * 00041 */ 00042 00043 00044 #include "pcre++.h" 00045 00046 00047 /* 00048 * CONSTRUCTORS 00049 */ 00050 Pcre::Pcre(const string& expression) { 00051 _expression = expression; 00052 _flags = 0; 00053 case_t = global_t = false; 00054 zero(); 00055 Compile(0); 00056 } 00057 00058 Pcre::Pcre(const string& expression, const string& flags) { 00059 _expression = expression; 00060 unsigned int FLAG = 0; 00061 00062 for(unsigned int flag=0; flag<flags.length(); flag++) { 00063 switch(flags[flag]) { 00064 case 'i': FLAG |= PCRE_CASELESS; case_t = true; break; 00065 case 'm': FLAG |= PCRE_MULTILINE; break; 00066 case 's': FLAG |= PCRE_DOTALL; break; 00067 case 'x': FLAG |= PCRE_EXTENDED; break; 00068 case 'g': global_t = true; break; 00069 } 00070 } 00071 00072 _flags = FLAG; 00073 00074 zero(); 00075 Compile(FLAG); 00076 } 00077 00078 Pcre::Pcre(const Pcre &P) { 00079 _expression = P._expression; 00080 _flags = P._flags; 00081 case_t = P.case_t; 00082 global_t = P.global_t; 00083 zero(); 00084 Compile(_flags); 00085 } 00086 00087 Pcre::Pcre() { 00088 zero(); 00089 } 00090 00091 00092 00093 00094 00095 00096 00097 /* 00098 * Destructor 00099 */ 00100 Pcre::~Pcre() { 00101 /* avoid deleting of uninitialized pointers */ 00102 if (p_pcre != NULL) { 00103 pcre_free(p_pcre); 00104 } 00105 if (p_pcre_extra != NULL) { 00106 pcre_free(p_pcre_extra); 00107 } 00108 if(sub_vec != NULL) { 00109 delete[] sub_vec; 00110 } 00111 if(num_matches > 0) { 00112 delete resultset; 00113 } 00114 if(err_str != NULL) { 00115 delete err_str; 00116 } 00117 } 00118 00119 00120 00121 00122 /* 00123 * operator= definitions 00124 */ 00125 const Pcre& Pcre::operator = (const string& expression) { 00126 /* reset the object and re-intialize it */ 00127 reset(); 00128 _expression = expression; 00129 _flags = 0; 00130 case_t = global_t = false; 00131 Compile(0); 00132 return *this; 00133 } 00134 00135 00136 const Pcre& Pcre::operator = (const Pcre &P) { 00137 reset(); 00138 _expression = P._expression; 00139 _flags = P._flags; 00140 case_t = P.case_t; 00141 global_t = P.global_t; 00142 zero(); 00143 Compile(_flags); 00144 return *this; 00145 } 00146 00147 00148 00149 00150 00151 00152 /* 00153 * mem resetting methods 00154 */ 00155 void Pcre::zero() { 00156 /* what happens if p_pcre is already allocated? hm ... */ 00157 p_pcre_extra = NULL; 00158 p_pcre = NULL; 00159 sub_vec = NULL; 00160 resultset = NULL; 00161 err_str = NULL; 00162 num_matches = -1; 00163 } 00164 00165 void Pcre::reset() { 00166 did_match = false; 00167 num_matches = -1; 00168 } 00169 00170 00171 00172 00173 00174 /* 00175 * compile the expression 00176 */ 00177 void Pcre::Compile(int flags) { 00178 p_pcre = pcre_compile((char *)_expression.c_str(), flags, 00179 (const char **)(&err_str), &erroffset, NULL); 00180 00181 if(p_pcre == NULL) { 00182 /* umh, that's odd, the parser should not fail at all */ 00183 string Error = err_str; 00184 throw exception("pcre_compile(..) failed: " + Error); 00185 } 00186 00187 /* calculate the number of substrings we are willing to catch */ 00188 int where; 00189 int info = pcre_fullinfo( p_pcre, p_pcre_extra, PCRE_INFO_CAPTURECOUNT, &where); 00190 if(info == 0) { 00191 sub_len = (where +2) * 3; /* see "man pcre" for the exact formula */ 00192 } 00193 else { 00194 throw exception(info); 00195 } 00196 reset(); 00197 } 00198 00199 00200 00201 00202 /* 00203 * API methods 00204 */ 00205 bool Pcre::search(const string& stuff, int OffSet) { 00206 return dosearch(stuff, OffSet); 00207 } 00208 00209 bool Pcre::search(const string& stuff) { 00210 return dosearch(stuff, 0); 00211 } 00212 00213 bool Pcre::dosearch(const string& stuff, int OffSet) { 00214 reset(); 00215 sub_vec = new int[sub_len]; 00216 int num = pcre_exec(p_pcre, p_pcre_extra, (char *)stuff.c_str(), 00217 (int)stuff.length(), OffSet, 0, (int *)sub_vec, sub_len); 00218 00219 if(num < 0) { 00220 /* no match at all */ 00221 return false; 00222 } 00223 else if(num == 0) { 00224 /* vector too small, there were too many substrings in stuff */ 00225 return false; 00226 } 00227 else if(num == 1) { 00228 /* we had a match, but without substrings */ 00229 did_match = true; 00230 num_matches = 0; 00231 return true; 00232 } 00233 else if(num > 1) { 00234 /* we had matching substrings */ 00235 resultset = new Array; 00236 const char **stringlist; 00237 did_match = true; 00238 num_matches = num - 1; 00239 00240 int res = pcre_get_substring_list((char *)stuff.c_str(), sub_vec, num, &stringlist); 00241 if(res == 0) { 00242 for(int i=1; i<num; i++) { 00243 resultset->push_back(stringlist[i]); 00244 } 00245 pcre_free_substring_list(stringlist); 00246 } 00247 else { 00248 throw exception(res); 00249 } 00250 return true; 00251 } 00252 else { 00253 /* some other uncommon error occured */ 00254 return false; 00255 } 00256 } 00257 00258 Array* Pcre::get_sub_strings() { 00259 if(resultset != NULL) 00260 return resultset; 00261 else 00262 return NULL; 00263 } 00264 00265 string Pcre::get_match(int pos) { 00266 if(pos >= 0 && pos < num_matches) { 00267 ArrayIterator P = resultset->begin() + pos; 00268 return *P; 00269 } 00270 else { 00271 throw exception("out of range"); 00272 } 00273 } 00274 00275 int Pcre::get_match_start() { 00276 if (sub_vec) 00277 return sub_vec[0]; 00278 else 00279 return -1; 00280 } 00281 00282 int Pcre::get_match_end() { 00283 if (sub_vec) 00284 return sub_vec[1] - 1; 00285 else 00286 return -1; 00287 } 00288 00289 int Pcre::get_match_start(int pos) { 00290 if(pos >= 0 && pos <= num_matches) { 00291 /* 00292 * sub_vec[0] and [1] is the start/end of the entire string. 00293 */ 00294 return sub_vec[ (++pos) * 2 ]; 00295 } 00296 else { 00297 throw exception("out of range"); 00298 } 00299 } 00300 00301 int Pcre::get_match_end(int pos) { 00302 if(pos >= 0 && pos <= num_matches) { 00303 /* 00304 * the end offset of a subpattern points to 00305 * the first offset of the next substring, 00306 * therefore -1 00307 */ 00308 return sub_vec[ ((++pos) * 2) + 1 ] - 1; 00309 } 00310 else { 00311 throw exception("out of range"); 00312 } 00313 } 00314 00315 size_t Pcre::get_match_length(int pos) { 00316 if(pos >= 0 && pos < num_matches) { 00317 ArrayIterator P = resultset->begin() + pos; 00318 return P->length(); 00319 } 00320 else { 00321 throw exception("out of range"); 00322 } 00323 } 00324 00325 Array Pcre::_split(const string& piece, int limit, int start_offset, int end_offset) { 00326 Array Splitted; 00327 /* _expression will be used as delimiter */ 00328 if(_expression.length() == 1) { 00329 /* use the plain c++ way, ignore the pre-compiled p_pcre */ 00330 string buffer, _delimiter, _piece; 00331 char z; 00332 if(case_t) { 00333 z = toupper(_expression[0]); 00334 for(size_t pos=0; pos < piece.length(); pos++) { 00335 _piece += (char)toupper(piece[pos]); 00336 } 00337 } 00338 else { 00339 z = _expression[0]; 00340 _piece = piece; 00341 } 00342 for(size_t pos=0; pos<piece.length(); pos++) { 00343 if(_piece[pos] == z) { 00344 Splitted.push_back(buffer); 00345 buffer = ""; 00346 } 00347 else { 00348 buffer += piece[pos]; 00349 } 00350 } 00351 if(buffer != "") { 00352 Splitted.push_back(buffer); 00353 } 00354 } 00355 else { 00356 /* use the regex way */ 00357 if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) { 00358 /* oh, oh - the pre-compiled expression does not contain brackets */ 00359 pcre_free(p_pcre); 00360 pcre_free(p_pcre_extra); 00361 00362 pcre *_p = NULL; 00363 pcre_extra *_e = NULL;; 00364 00365 p_pcre = _p; 00366 p_pcre_extra = _e; 00367 00368 _expression = "(" + _expression + ")"; 00369 Compile(_flags); 00370 } 00371 int num_pieces=0, pos=0, piece_end = 0, piece_start = 0; 00372 for(;;) { 00373 if(search(piece, pos) == true) { 00374 if(matches() > 0) { 00375 piece_end = get_match_start(0) - 1; 00376 piece_start = pos; 00377 pos = piece_end + 1 + get_match_length(0); 00378 string junk(piece, piece_start, (piece_end - piece_start)+1); 00379 num_pieces++; 00380 if( (limit != 0 && num_pieces < limit) || limit == 0) { 00381 if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) { 00382 if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) { 00383 /* we are within the allowed range, so just add the grab */ 00384 Splitted.push_back(junk); 00385 } 00386 } 00387 } 00388 } 00389 } 00390 else { 00391 /* the rest of the string, there are no more delimiters */ 00392 string junk(piece, pos, (piece.length() - pos)); 00393 num_pieces++; 00394 if( (limit != 0 && num_pieces < limit) || limit == 0) { 00395 if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) { 00396 if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) { 00397 /* we are within the allowed range, so just add the grab */ 00398 Splitted.push_back(junk); 00399 } 00400 } 00401 } 00402 break; 00403 } 00404 } // for() 00405 } // if(_expression.length() 00406 return Splitted; 00407 } 00408 00409 Array Pcre::split(const string& piece) { 00410 return _split(piece, 0, 0, 0); 00411 } 00412 00413 Array Pcre::split(const string& piece, int limit) { 00414 return _split(piece, limit, 0, 0); 00415 } 00416 00417 Array Pcre::split(const string& piece, int limit, int start_offset) { 00418 return _split(piece, limit, start_offset, 0); 00419 } 00420 00421 Array Pcre::split(const string& piece, int limit, int start_offset, int end_offset) { 00422 return _split(piece, limit, start_offset, end_offset); 00423 } 00424 00425 Array Pcre::split(const string& piece, vector<int> positions) { 00426 Array PreSplitted = _split(piece, 0, 0, 0); 00427 Array Splitted; 00428 for(vector<int>::iterator vecIt=positions.begin(); vecIt != positions.end(); ++vecIt) { 00429 Splitted.push_back(PreSplitted[*vecIt]); 00430 } 00431 return Splitted; 00432 } 00433 00434 00435 00436 string Pcre::replace(const string& piece, const string& with) { 00437 string Replaced(piece); 00438 00439 /* 00440 * very first job: look, if the expression already contains 00441 * braces, if yes, do not add braces, else, do it 00442 */ 00443 Pcre braces("[^\\\\]\\(.*[^\\\\]\\)"); // perlish: [^\\]\(.*[^\\]\) 00444 if(! braces.search(_expression)) { 00445 // if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) { 00446 /* oh, oh - the pre-compiled expression does not contain brackets */ 00447 00448 /* recreate the p_pcre* objects to avoid memory leaks */ 00449 pcre_free(p_pcre); 00450 pcre_free(p_pcre_extra); 00451 00452 pcre *_p = NULL; 00453 pcre_extra *_e = NULL;; 00454 00455 p_pcre = _p; 00456 p_pcre_extra = _e; 00457 00458 _expression = "(" + _expression + ")"; 00459 Compile(_flags); 00460 } 00461 00462 if(search(piece)) { 00463 /* we found at least one match */ 00464 string use_with = _replace_vars(with); 00465 if(!global_t) { 00466 /* 00467 * only once, use the entire match 00468 * Patch submitted by Mark Carrington <mark@mutantpenguin.co.uk> 00469 */ 00470 if(matched() && matches() >= 1) { 00471 int len = get_match_end() - get_match_start() + 1; 00472 Replaced.replace(get_match_start(0), len, use_with); 00473 } 00474 } 00475 else { 00476 /* 00477 * global replace. 00478 * 00479 * We need to keep checking the line after it is modified to see the next match. 00480 * Especially \s is something of a bitch as it can be a newline, return carriage, 00481 * space, tab, etc ... so we have to keep searching for the next type. 00482 * Patch submitted by Jim Hull <imaginos@imaginos.net> 00483 */ 00484 string sLeftOver = Replaced; 00485 int iCurPosition = 0; 00486 while( search( sLeftOver ) ) { 00487 if( matched() && matches() >= 1 ) { 00488 int len = 0; 00489 string lookfor; 00490 lookfor.erase(); 00491 int match_pos; 00492 for (match_pos = 0; match_pos < matches(); match_pos++) { 00493 len += ((get_match_end(match_pos) - get_match_start(match_pos)) + 1); 00494 lookfor += get_match(match_pos); 00495 } 00496 match_pos = Replaced.find( lookfor, iCurPosition ); 00497 Replaced.replace(match_pos, len, use_with); 00498 iCurPosition = ( match_pos + use_with.length() ); 00499 sLeftOver = Replaced.substr( iCurPosition, string::npos ); 00500 } 00501 } 00502 } 00503 } 00504 return Replaced; 00505 } 00506 00507 00508 00509 string Pcre::_replace_vars(const string& piece) { 00510 Pcre dollar("\\$[0-9]+"); 00511 string with = piece; 00512 if(dollar.search(with)) { 00513 for(int index=0; index < num_matches; index++) { 00514 /* do it for each existing sub string */ 00515 string sub = get_match(index); // what "$1" resulted 00516 ostringstream num; 00517 num << index+1; 00518 string dollar_num = "(\\$" + num.str() + ")"; 00519 Pcre subsplit(dollar_num); // "\\$1" 00520 // normally 2 (or more) parts, the one in front of and the other one after "$1" 00521 Array splitted = subsplit.split(with); 00522 string Replaced; 00523 for(size_t pos=0; pos < splitted.size(); pos++) { 00524 if(pos == (splitted.size() - 1)) 00525 Replaced += splitted[pos]; 00526 else 00527 Replaced += splitted[pos] + sub; 00528 } 00529 with = Replaced; // well, one part is done 00530 } 00531 return with; 00532 } 00533 else { 00534 /* hm, no $[0-9]+ stuff, so just return it untouched */ 00535 return with; 00536 } 00537 } Generated on Tue Jul 16 22:14:38 2002 for PCRE++ by 1.2.13.1 written by Dimitri van Heesch, © 1997-2001 |