加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
re.c 14.70 KB
一键复制 编辑 原始数据 按行查看 历史
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
/*
* re.c - compile regular expressions.
*/
/*
* Copyright (C) 1991-2015 the Free Software Foundation, Inc.
*
* This file is part of GAWK, the GNU implementation of the
* AWK Programming Language.
*
* GAWK is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* GAWK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include "awk.h"
static reg_syntax_t syn;
static void check_bracket_exp(char *s, size_t len);
const char *regexflags2str(int flags);
/* make_regexp --- generate compiled regular expressions */
Regexp *
make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
{
static char metas[] = ".*+(){}[]|?^$\\";
Regexp *rp;
const char *rerr;
const char *src = s;
static char *buf = NULL;
static size_t buflen;
const char *end = s + len;
char *dest;
int c, c2;
static bool first = true;
static bool no_dfa = false;
bool has_anchor = false;
reg_syntax_t dfa_syn;
int i;
/*
* The number of bytes in the current multibyte character.
* It is 0, when the current character is a singlebyte character.
*/
size_t is_multibyte = 0;
mbstate_t mbs;
memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
if (first) {
first = false;
/* for debugging and testing */
no_dfa = (getenv("GAWK_NO_DFA") != NULL);
}
/* always check */
check_bracket_exp((char *) s, len);
/* Handle escaped characters first. */
/*
* Build a copy of the string (in buf) with the
* escaped characters translated, and generate the regex
* from that.
*/
if (buf == NULL) {
emalloc(buf, char *, len + 1, "make_regexp");
buflen = len;
} else if (len > buflen) {
erealloc(buf, char *, len + 1, "make_regexp");
buflen = len;
}
dest = buf;
while (src < end) {
if (gawk_mb_cur_max > 1 && ! is_multibyte) {
/* The previous byte is a singlebyte character, or last byte
of a multibyte character. We check the next character. */
is_multibyte = mbrlen(src, end - src, &mbs);
if ( is_multibyte == 1
|| is_multibyte == (size_t) -1
|| is_multibyte == (size_t) -2
|| is_multibyte == 0) {
/* We treat it as a single-byte character. */
is_multibyte = 0;
}
}
/* We skip multibyte character, since it must not be a special
character. */
if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
(*src == '\\')) {
c = *++src;
switch (c) {
case 'a':
case 'b':
case 'f':
case 'n':
case 'r':
case 't':
case 'v':
case 'x':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
c2 = parse_escape(&src);
if (c2 < 0)
cant_happen();
/*
* Unix awk treats octal (and hex?) chars
* literally in re's, so escape regexp
* metacharacters.
*/
if (do_traditional
&& ! do_posix
&& (isdigit(c) || c == 'x')
&& strchr("()|*+?.^$\\[]", c2) != NULL)
*dest++ = '\\';
*dest++ = (char) c2;
break;
case '8':
case '9': /* a\9b not valid */
*dest++ = c;
src++;
break;
case 'y': /* normally \b */
/* gnu regex op */
if (! do_traditional) {
*dest++ = '\\';
*dest++ = 'b';
src++;
break;
}
/* else, fall through */
default:
*dest++ = '\\';
*dest++ = (char) c;
src++;
break;
} /* switch */
} else {
c = *src;
if (c == '^' || c == '$')
has_anchor = true;
*dest++ = *src++; /* not '\\' */
}
if (gawk_mb_cur_max > 1 && is_multibyte)
is_multibyte--;
} /* while */
*dest = '\0';
len = dest - buf;
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
rp->dfareg = NULL;
rp->pat.allocated = 0; /* regex will allocate the buffer */
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
/*
* Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
* was going to turn out to be, I wouldn't have bothered with it.
*
* In the case where we have a multibyte character set, we have no
* choice but to use RE_ICASE, since the casetable is for single-byte
* character sets only.
*
* On the other hand, if we do have a single-byte character set,
* using the casetable should give a performance improvement, since
* it's computed only once, not each time a regex is compiled. We
* also think it's probably better for portability. See the
* discussion by the definition of casetable[] in eval.c.
*/
ignorecase = !! ignorecase; /* force to 1 or 0 */
if (ignorecase) {
if (gawk_mb_cur_max > 1) {
syn |= RE_ICASE;
rp->pat.translate = NULL;
} else {
syn &= ~RE_ICASE;
rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
}
} else {
rp->pat.translate = NULL;
syn &= ~RE_ICASE;
}
dfa_syn = syn;
if (ignorecase)
dfa_syn |= RE_ICASE;
dfasyntax(dfa_syn, ignorecase, '\n');
re_set_syntax(syn);
if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
refree(rp);
if (! canfatal) {
/* rerr already gettextized inside regex routines */
error("%s: /%s/", rerr, buf);
return NULL;
}
fatal("%s: /%s/", rerr, buf);
}
/* gack. this must be done *after* re_compile_pattern */
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
rp->dfa = true;
rp->dfareg = dfaalloc();
dfacomp(buf, len, rp->dfareg, true);
} else
rp->dfa = false;
rp->has_anchor = has_anchor;
/* Additional flags that help with RS as regexp. */
for (i = 0; i < len; i++) {
if (strchr(metas, buf[i]) != NULL) {
rp->has_meta = true;
break;
}
}
for (i = len - 1; i >= 0; i--) {
if (strchr("*+|?", buf[i]) != NULL) {
rp->maybe_long = true;
break;
}
}
return rp;
}
/* research --- do a regexp search. use dfa if possible */
int
research(Regexp *rp, char *str, int start,
size_t len, int flags)
{
const char *ret = str;
int try_backref = false;
int need_start;
int no_bol;
int res;
need_start = ((flags & RE_NEED_START) != 0);
no_bol = ((flags & RE_NO_BOL) != 0);
if (no_bol)
rp->pat.not_bol = 1;
/*
* Always do dfa search if can; if it fails, then even if
* need_start is true, we won't bother with the regex search.
*
* The dfa matcher doesn't have a no_bol flag, so don't bother
* trying it in that case.
*
* 7/2008: Skip the dfa matcher if need_start. The dfa matcher
* has bugs in certain multibyte cases and it's too difficult
* to try to special case things.
*/
if (rp->dfa && ! no_bol && ! need_start) {
char save;
size_t count = 0;
struct dfa *superset = dfasuperset(rp->dfareg);
/*
* dfa likes to stick a '\n' right after the matched
* text. So we just save and restore the character.
*/
save = str[start+len];
if (superset)
ret = dfaexec(superset, str+start, str+start+len,
true, NULL, NULL);
if (ret)
ret = dfaexec(rp->dfareg, str+start, str+start+len,
true, &count, &try_backref);
str[start+len] = save;
}
if (ret) {
if (need_start || rp->dfa == false || try_backref) {
/*
* Passing NULL as last arg speeds up search for cases
* where we don't need the start/end info.
*/
res = re_search(&(rp->pat), str, start+len,
start, len, need_start ? &(rp->regs) : NULL);
} else
res = 1;
} else
res = -1;
rp->pat.not_bol = 0;
return res;
}
/* refree --- free up the dynamic memory used by a compiled regexp */
void
refree(Regexp *rp)
{
if (rp == NULL)
return;
rp->pat.translate = NULL;
regfree(& rp->pat);
if (rp->regs.start)
free(rp->regs.start);
if (rp->regs.end)
free(rp->regs.end);
if (rp->dfa) {
dfafree(rp->dfareg);
free(rp->dfareg);
}
efree(rp);
}
/* dfaerror --- print an error message for the dfa routines */
void
dfaerror(const char *s)
{
fatal("%s", s);
exit(EXIT_FATAL); /* for DJGPP */
}
/* re_update --- recompile a dynamic regexp */
Regexp *
re_update(NODE *t)
{
NODE *t1;
if ((t->re_flags & CASE) == IGNORECASE) {
/* regex was compiled with settings matching IGNORECASE */
if ((t->re_flags & CONSTANT) != 0) {
/* it's a constant, so just return it as is */
assert(t->type == Node_regex || t->type == Node_typedregex);
return t->re_reg;
}
t1 = t->re_exp;
if (t1->type == Node_typedregex) {
assert((t1->re_flags & CONSTANT) != 0);
return t1->re_reg;
}
if (t->re_text != NULL) {
/* if contents haven't changed, just return it */
if (cmp_nodes(t->re_text, t1) == 0)
return t->re_reg;
/* things changed, fall through to recompile */
unref(t->re_text);
}
/* get fresh copy of the text of the regexp */
t->re_text = dupnode(t1);
}
/* was compiled with different IGNORECASE or text changed */
/* free old */
if (t->re_reg != NULL)
refree(t->re_reg);
if (t->re_cnt > 0)
t->re_cnt++;
if (t->re_cnt > 10)
t->re_cnt = 0;
if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
/* reset regexp text if needed */
t1 = t->re_exp;
unref(t->re_text);
t->re_text = dupnode(t1);
}
/* compile it */
t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
IGNORECASE, t->re_cnt, true);
/* clear case flag */
t->re_flags &= ~CASE;
/* set current value of case flag */
t->re_flags |= IGNORECASE;
return t->re_reg;
}
/* resetup --- choose what kind of regexps we match */
void
resetup()
{
/*
* Syntax bits: _that_ is yet another mind trip. Recreational drugs
* are helpful for recovering from the experience.
*
* Aharon Robbins <arnold@skeeve.com>
* Sun, 21 Oct 2007 23:55:33 +0200
*/
if (do_posix)
syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
else if (do_traditional)
syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
else
syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
/*
* Interval expressions are now on by default, as POSIX is
* wide-spread enough that people want it. The do_intervals
* variable remains for use with --traditional.
*/
if (do_intervals)
syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
(void) re_set_syntax(syn);
dfasyntax(syn, false, '\n');
}
/* avoid_dfa --- return true if we should not use the DFA matcher */
int
avoid_dfa(NODE *re, char *str, size_t len)
{
char *end;
/*
* f = @/.../
* if ("foo" ~ f) ...
*
* This creates a Node_dynregex with NULL re_reg.
*/
if (re->re_reg == NULL)
return false;
if (! re->re_reg->has_anchor)
return false;
for (end = str + len; str < end; str++)
if (*str == '\n')
return true;
return false;
}
/* reisstring --- return true if the RE match is a simple string match */
int
reisstring(const char *text, size_t len, Regexp *re, const char *buf)
{
int res;
const char *matched;
/* simple checking for meta characters in re */
if (re->has_meta)
return false; /* give up early, can't be string match */
/* make accessable to gdb */
matched = &buf[RESTART(re, buf)];
res = (memcmp(text, matched, len) == 0);
return res;
}
/* reflags2str --- make a regex flags value readable */
const char *
reflags2str(int flagval)
{
static const struct flagtab values[] = {
{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
{ RE_INTERVALS, "RE_INTERVALS" },
{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
{ RE_ICASE, "RE_ICASE" },
{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
{ RE_NO_SUB, "RE_NO_SUB" },
{ 0, NULL },
};
if (flagval == RE_SYNTAX_EMACS) /* == 0 */
return "RE_SYNTAX_EMACS";
return genflags2str(flagval, values);
}
/*
* dfawarn() is called by the dfa routines whenever a regex is compiled
* must supply a dfawarn.
*/
void
dfawarn(const char *dfa_warning)
{
/*
* This routine does nothing, since gawk does its own
* (better) check for bad [[:foo:]] syntax.
*/
}
/* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */
static void
check_bracket_exp(char *s, size_t length)
{
static struct reclass {
const char *name;
size_t len;
bool warned;
} classes[] = {
/*
* Ordered by what we hope is frequency,
* since it's linear searched.
*/
{ "[:alpha:]", 9, false },
{ "[:digit:]", 9, false },
{ "[:alnum:]", 9, false },
{ "[:upper:]", 9, false },
{ "[:lower:]", 9, false },
{ "[:space:]", 9, false },
{ "[:xdigit:]", 10, false },
{ "[:punct:]", 9, false },
{ "[:print:]", 9, false },
{ "[:graph:]", 9, false },
{ "[:cntrl:]", 9, false },
{ "[:blank:]", 9, false },
{ NULL, 0 }
};
int i;
bool found = false;
char save;
char *sp, *sp2, *end;
int len;
int count = 0;
if (length == 0)
return;
end = s + length;
save = s[length];
s[length] = '\0';
sp = s;
again:
sp = sp2 = memchr(sp, '[', (end - sp));
if (sp == NULL)
goto done;
for (count++, sp++; *sp != '\0'; sp++) {
if (*sp == '[')
count++;
/*
* ] as first char after open [ is skipped
* \] is skipped
* [^]] is skipped
*/
if (*sp == ']' && sp > sp2) {
if (sp[-1] != '['
&& sp[-1] != '\\')
;
else if ((sp - sp2) >= 2
&& sp[-1] == '^' && sp[-2] == '[')
;
else
count--;
}
if (count == 0) {
sp++; /* skip past ']' */
break;
}
}
if (count > 0) { /* bad regex, give up */
goto done;
}
/* sp2 has start */
for (i = 0; classes[i].name != NULL; i++) {
if (classes[i].warned)
continue;
len = classes[i].len;
if ( len == (sp - sp2)
&& memcmp(sp2, classes[i].name, len) == 0) {
found = true;
break;
}
}
if (found && ! classes[i].warned) {
warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
len, sp2, len, sp2);
classes[i].warned = true;
}
if (sp < end) {
found = false;
goto again;
}
done:
s[length] = save;
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化