123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512 |
- .\" Start example.
- .de EX
- . nr mE \\n(.f
- . nf
- . nh
- . ft CW
- ..
- .
- .
- .\" End example.
- .de EE
- . ft \\n(mE
- . fi
- . hy \\n(HY
- ..
- .
- .EX
- /*************************************************
- * PCRE2 DEMONSTRATION PROGRAM *
- *************************************************/
- /* This is a demonstration program to illustrate a straightforward way of
- using the PCRE2 regular expression library from a C program. See the
- pcre2sample documentation for a short discussion ("man pcre2sample" if you have
- the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
- incompatible with the original PCRE API.
- There are actually three libraries, each supporting a different code unit
- width. This demonstration program uses the 8-bit library. The default is to
- process each code unit as a separate character, but if the pattern begins with
- "(*UTF)", both it and the subject are treated as UTF-8 strings, where
- characters may occupy multiple code units.
- In Unix-like environments, if PCRE2 is installed in your standard system
- libraries, you should be able to compile this program using this command:
- cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
- If PCRE2 is not installed in a standard place, it is likely to be installed
- with support for the pkg-config mechanism. If you have pkg-config, you can
- compile this program using this command:
- cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
- If you do not have pkg-config, you may have to use something like this:
- cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
- -R/usr/local/lib -lpcre2-8 -o pcre2demo
- Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
- library files for PCRE2 are installed on your system. Only some operating
- systems (Solaris is one) use the -R option.
- Building under Windows:
- If you want to statically link this program against a non-dll .a file, you must
- define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
- the following line. */
- /* #define PCRE2_STATIC */
- /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
- For a program that uses only one code unit width, setting it to 8, 16, or 32
- makes it possible to use generic function names such as pcre2_compile(). Note
- that just changing 8 to 16 (for example) is not sufficient to convert this
- program to process 16-bit characters. Even in a fully 16-bit environment, where
- string-handling functions such as strcmp() and printf() work with 16-bit
- characters, the code for handling the table of named substrings will still need
- to be modified. */
- #define PCRE2_CODE_UNIT_WIDTH 8
- #include <stdio.h>
- #include <string.h>
- #include <pcre2.h>
- /**************************************************************************
- * Here is the program. The API includes the concept of "contexts" for *
- * setting up unusual interface requirements for compiling and matching, *
- * such as custom memory managers and non-standard newline definitions. *
- * This program does not do any of this, so it makes no use of contexts, *
- * always passing NULL where a context could be given. *
- * *
- * Usage: pcre2demo [-g] <pattern> <subject> *
- * *
- * main() compiles the pattern, matches it once against the subject, and *
- * prints the numbered and named captured substrings. With -g it goes on *
- * to find and print every further match. It returns 0 when the first *
- * match succeeds and the run completes, and 1 for a bad command line, a *
- * compile error, no match or a match error on the first attempt, a match *
- * error on a later attempt, or a match whose start is after its end. *
- **************************************************************************/
- int main(int argc, char **argv)
- {
- pcre2_code *re;
- PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
- PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
- PCRE2_SPTR name_table;
- int crlf_is_newline;
- int errornumber;
- int find_all;
- int i;
- int rc;
- int utf8;
- uint32_t option_bits;
- uint32_t namecount;
- uint32_t name_entry_size;
- uint32_t newline;
- PCRE2_SIZE erroroffset;
- PCRE2_SIZE *ovector;
- PCRE2_SIZE subject_length;
- pcre2_match_data *match_data;
- /**************************************************************************
- * First, sort out the command line. There is only one possible option at *
- * the moment, "-g" to request repeated matching to find all occurrences, *
- * like Perl's /g option. We set the variable find_all to a non-zero value *
- * if the -g option is present. *
- **************************************************************************/
- find_all = 0;
- for (i = 1; i < argc; i++)
- {
- if (strcmp(argv[i], "-g") == 0) find_all = 1;
- else if (argv[i][0] == '-')
- {
- printf("Unrecognised option %s\en", argv[i]);
- return 1;
- }
- else break;
- }
- /* After the options, we require exactly two arguments, which are the pattern,
- and the subject string. At this point i is the index of the first argument
- that does not start with a hyphen, or argc if there is none. */
- if (argc - i != 2)
- {
- printf("Exactly two arguments required: a regex and a subject string\en");
- return 1;
- }
- /* Pattern and subject are char arguments, so they can be straightforwardly
- cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
- length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
- defined to be size_t. */
- pattern = (PCRE2_SPTR)argv[i];
- subject = (PCRE2_SPTR)argv[i+1];
- subject_length = (PCRE2_SIZE)strlen((char *)subject);
- /*************************************************************************
- * Now we are going to compile the regular expression pattern, and handle *
- * any errors that are detected. *
- *************************************************************************/
- re = pcre2_compile(
- pattern, /* the pattern */
- PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
- 0, /* default options */
- &errornumber, /* for error number */
- &erroroffset, /* for error offset */
- NULL); /* use default compile context */
- /* Compilation failed: print the error message and exit. */
- if (re == NULL)
- {
- PCRE2_UCHAR buffer[256];
- pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
- printf("PCRE2 compilation failed at offset %d: %s\en", (int)erroroffset,
- buffer);
- return 1;
- }
- /*************************************************************************
- * If the compilation succeeded, we call PCRE2 again, in order to do a *
- * pattern match against the subject string. This does just ONE match. If *
- * further matching is needed, it will be done below. Before running the *
- * match we must set up a match_data block for holding the result. Using *
- * pcre2_match_data_create_from_pattern() ensures that the block is *
- * exactly the right size for the number of capturing parentheses in the *
- * pattern. If you need to know the actual size of a match_data block as *
- * a number of bytes, you can find it like this: *
- * *
- * PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); *
- *************************************************************************/
- match_data = pcre2_match_data_create_from_pattern(re, NULL);
- /* Now run the match. NOTE(review): match_data is used without being checked;
- presumably pcre2_match_data_create_from_pattern() can return NULL when memory
- cannot be obtained, so confirm this against the pcre2api documentation. */
- rc = pcre2_match(
- re, /* the compiled pattern */
- subject, /* the subject string */
- subject_length, /* the length of the subject */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- match_data, /* block for storing the result */
- NULL); /* use default match context */
- /* Matching failed: handle error cases */
- if (rc < 0)
- {
- switch(rc)
- {
- case PCRE2_ERROR_NOMATCH: printf("No match\en"); break;
- /*
- Handle other special cases if you like
- */
- default: printf("Matching error %d\en", rc); break;
- }
- pcre2_match_data_free(match_data); /* Release memory used for the match */
- pcre2_code_free(re); /* data and the compiled pattern. */
- return 1;
- }
- /* Match succeeded. Get a pointer to the output vector, where string offsets
- are stored. */
- ovector = pcre2_get_ovector_pointer(match_data);
- printf("Match succeeded at offset %d\en", (int)ovector[0]);
- /*************************************************************************
- * We have found the first match within the subject string. If the output *
- * vector wasn't big enough, say so. Then output any substrings that were *
- * captured. *
- *************************************************************************/
- /* The output vector wasn't big enough. This should not happen, because we used
- pcre2_match_data_create_from_pattern() above. */
- if (rc == 0)
- printf("ovector was not big enough for all the captured substrings\en");
- /* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
- to set the start of a match later than its end. In this demonstration program,
- we just detect this case and give up. */
- if (ovector[0] > ovector[1])
- {
- printf("\e\eK was used in an assertion to set the match start after its end.\en"
- "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]),
- (char *)(subject + ovector[1]));
- printf("Run abandoned\en");
- pcre2_match_data_free(match_data);
- pcre2_code_free(re);
- return 1;
- }
- /* Show substrings stored in the output vector by number. Obviously, in a real
- application you might want to do things other than print them. */
- for (i = 0; i < rc; i++)
- {
- PCRE2_SPTR substring_start = subject + ovector[2*i];
- PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
- printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
- }
- /**************************************************************************
- * That concludes the basic part of this demonstration program. We have *
- * compiled a pattern, and performed a single match. The code that follows *
- * shows first how to access named substrings, and then how to code for *
- * repeated matches on the same subject. *
- **************************************************************************/
- /* See if there are any named substrings, and if so, show them by name. First
- we have to extract the count of named parentheses from the pattern. */
- (void)pcre2_pattern_info(
- re, /* the compiled pattern */
- PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
- &namecount); /* where to put the answer */
- if (namecount == 0) printf("No named substrings\en"); else
- {
- PCRE2_SPTR tabptr;
- printf("Named substrings\en");
- /* Before we can access the substrings, we must extract the table for
- translating names to numbers, and the size of each entry in the table. */
- (void)pcre2_pattern_info(
- re, /* the compiled pattern */
- PCRE2_INFO_NAMETABLE, /* address of the table */
- &name_table); /* where to put the answer */
- (void)pcre2_pattern_info(
- re, /* the compiled pattern */
- PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
- &name_entry_size); /* where to put the answer */
- /* Now we can scan the table and, for each entry, print the number, the name,
- and the substring itself. In the 8-bit library the number is held in two
- bytes, most significant first. The name starts at tabptr + 2, and
- name_entry_size - 3 is given to printf as the field width so that the names
- are right-justified in a column. Each step moves tabptr on by one entry.
- NOTE(review): i is an int and namecount is a uint32_t, so the loop test
- compares signed with unsigned, and the field width argument is an unsigned
- expression where printf expects an int; this is harmless for small tables. */
- tabptr = name_table;
- for (i = 0; i < namecount; i++)
- {
- int n = (tabptr[0] << 8) | tabptr[1];
- printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
- (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
- tabptr += name_entry_size;
- }
- }
- /*************************************************************************
- * If the "-g" option was given on the command line, we want to continue *
- * to search for additional matches in the subject string, in a similar *
- * way to the /g option in Perl. This turns out to be trickier than you *
- * might think because of the possibility of matching an empty string. *
- * What happens is as follows: *
- * *
- * If the previous match was NOT for an empty string, we can just start *
- * the next match at the end of the previous one. *
- * *
- * If the previous match WAS for an empty string, we can't do that, as it *
- * would lead to an infinite loop. Instead, a call of pcre2_match() is *
- * made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
- * first of these tells PCRE2 that an empty string at the start of the *
- * subject is not a valid match; other possibilities must be tried. The *
- * second flag restricts PCRE2 to one match attempt at the initial string *
- * position. If this match succeeds, an alternative to the empty string *
- * match has been found, and we can print it and proceed round the loop, *
- * advancing by the length of whatever was found. If this match does not *
- * succeed, we still stay in the loop, advancing by just one character. *
- * In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
- * more than one byte. *
- * *
- * However, there is a complication concerned with newlines. When the *
- * newline convention is such that CRLF is a valid newline, we must *
- * advance by two characters rather than one. The newline convention can *
- * be set in the regex by (*CR), etc.; if not, we must find the default. *
- *************************************************************************/
- if (!find_all) /* Check for -g */
- {
- pcre2_match_data_free(match_data); /* Release the memory that was used */
- pcre2_code_free(re); /* for the match data and the pattern. */
- return 0; /* Exit the program. */
- }
- /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
- sequence. First, find the options with which the regex was compiled and extract
- the UTF state. */
- (void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits);
- utf8 = (option_bits & PCRE2_UTF) != 0;
- /* Now find the newline convention and see whether CRLF is a valid newline
- sequence. */
- (void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline);
- crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
- newline == PCRE2_NEWLINE_CRLF ||
- newline == PCRE2_NEWLINE_ANYCRLF;
- /* Loop for second and subsequent matches */
- for (;;)
- {
- uint32_t options = 0; /* Normally no options */
- PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
- /* If the previous match was for an empty string, we are finished if we are
- at the end of the subject. Otherwise, arrange to run another match at the
- same point to see if a non-empty match can be found. */
- if (ovector[0] == ovector[1])
- {
- if (ovector[0] == subject_length) break;
- options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
- }
- /* If the previous match was not an empty string, there is one tricky case to
- consider. If a pattern contains \eK within a lookbehind assertion at the
- start, the end of the matched string can be at the offset where the match
- started. Without special action, this leads to a loop that keeps on matching
- the same substring. We must detect this case and arrange to move the start on
- by one character. The pcre2_get_startchar() function returns the starting
- offset that was passed to pcre2_match(). */
- else
- {
- PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
- if (start_offset <= startchar)
- {
- if (startchar >= subject_length) break; /* Reached end of subject. */
- start_offset = startchar + 1; /* Advance by one character. */
- if (utf8) /* If UTF-8, it may be more */
- { /* than one code unit. */
- for (; start_offset < subject_length; start_offset++)
- if ((subject[start_offset] & 0xc0) != 0x80) break;
- }
- }
- }
- /* Run the next matching operation */
- rc = pcre2_match(
- re, /* the compiled pattern */
- subject, /* the subject string */
- subject_length, /* the length of the subject */
- start_offset, /* starting offset in the subject */
- options, /* options */
- match_data, /* block for storing the result */
- NULL); /* use default match context */
- /* This time, a result of NOMATCH isn't an error. If the value in "options"
- is zero, it just means we have found all possible matches, so the loop ends.
- Otherwise, it means we have failed to find a non-empty-string match at a
- point where there was a previous empty-string match. In this case, we do what
- Perl does: advance the matching position by one character, and continue. We
- do this by setting the "end of previous match" offset, because that is picked
- up at the top of the loop as the point at which to start again.
- There are two complications: (a) When CRLF is a valid newline sequence, and
- the current position is just before it, advance by an extra byte. (b)
- Otherwise we must ensure that we skip an entire UTF character if we are in
- UTF mode. */
- if (rc == PCRE2_ERROR_NOMATCH)
- {
- if (options == 0) break; /* All matches found */
- ovector[1] = start_offset + 1; /* Advance one code unit */
- if (crlf_is_newline && /* If CRLF is a newline & */
- start_offset < subject_length - 1 && /* we are at CRLF, */
- subject[start_offset] == '\er' &&
- subject[start_offset + 1] == '\en')
- ovector[1] += 1; /* Advance by one more. */
- else if (utf8) /* Otherwise, ensure we */
- { /* advance a whole UTF-8 */
- while (ovector[1] < subject_length) /* character. */
- {
- if ((subject[ovector[1]] & 0xc0) != 0x80) break;
- ovector[1] += 1;
- }
- }
- continue; /* Go round the loop again */
- }
- /* Other matching errors are not recoverable. */
- if (rc < 0)
- {
- printf("Matching error %d\en", rc);
- pcre2_match_data_free(match_data);
- pcre2_code_free(re);
- return 1;
- }
- /* Match succeeded */
- printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);
- /* The match succeeded, but the output vector wasn't big enough. This
- should not happen. */
- if (rc == 0)
- printf("ovector was not big enough for all the captured substrings\en");
- /* We must guard against patterns such as /(?=.\eK)/ that use \eK in an
- assertion to set the start of a match later than its end. In this
- demonstration program, we just detect this case and give up. */
- if (ovector[0] > ovector[1])
- {
- printf("\e\eK was used in an assertion to set the match start after its end.\en"
- "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]),
- (char *)(subject + ovector[1]));
- printf("Run abandoned\en");
- pcre2_match_data_free(match_data);
- pcre2_code_free(re);
- return 1;
- }
- /* As before, show substrings stored in the output vector by number, and then
- also any named substrings. The name table address and entry size that were
- fetched after the first match are reused here; they are only read when
- namecount is non-zero, which is also the only case in which they were set. */
- for (i = 0; i < rc; i++)
- {
- PCRE2_SPTR substring_start = subject + ovector[2*i];
- size_t substring_length = ovector[2*i+1] - ovector[2*i];
- printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
- }
- if (namecount == 0) printf("No named substrings\en"); else
- {
- PCRE2_SPTR tabptr = name_table;
- printf("Named substrings\en");
- for (i = 0; i < namecount; i++)
- {
- int n = (tabptr[0] << 8) | tabptr[1];
- printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
- (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
- tabptr += name_entry_size;
- }
- }
- } /* End of loop to find second and subsequent matches */
- printf("\en");
- pcre2_match_data_free(match_data);
- pcre2_code_free(re);
- return 0;
- }
- /* End of pcre2demo.c */
- .EE
|