tregex.c - neatvi - [fork] simple vi-type editor with UTF-8 support
git clone git://src.adamsgaard.dk/neatvi
Log
Files
Refs
README
---
tregex.c (13827B)
---
     1 #include 
     2 #include 
     3 #include 
     4 #include 
     5 #include "regex.h"
     6 
     7 #define NGRPS                64        /* maximum number of groups */
     8 #define NREPS                128        /* maximum repetitions */
     9 #define NDEPT                256        /* re_rec() recursion depth limit */
    10 
    11 #define MAX(a, b)        ((a) < (b) ? (b) : (a))
    12 #define LEN(a)                (sizeof(a) / sizeof((a)[0]))
    13 
    14 /* regular expressions atoms */
    15 #define RA_CHR                '\0'        /* character literal */
    16 #define RA_BEG                '^'        /* string start */
    17 #define RA_END                '$'        /* string end */
    18 #define RA_ANY                '.'        /* any character */
    19 #define RA_BRK                '['        /* bracket expression */
    20 #define RA_WBEG                '<'        /* word start */
    21 #define RA_WEND                '>'        /* word end */
    22 
    23 /* regular expression node types */
    24 #define RN_ATOM                '\0'        /* regular expression */
    25 #define RN_CAT                'c'        /* concatenation */
    26 #define RN_ALT                '|'        /* alternate expressions */
    27 #define RN_GRP                '('        /* pattern group */
    28 
    29 /* regular expression program instructions */
    30 #define RI_ATOM                '\0'        /* regular expression */
    31 #define RI_FORK                'f'        /* fork the execution */
    32 #define RI_JUMP                'j'        /* jump to the given instruction */
    33 #define RI_MARK                'm'        /* mark the current position */
    34 #define RI_MATCH        'q'        /* the pattern is matched */
    35 
    36 /* regular expression atom */
    37 struct ratom {
    38         int ra;                        /* atom type (RA_*) */
    39         char *s;                /* atom argument */
    40 };
    41 
    42 /* regular expression instruction */
    43 struct rinst {
    44         struct ratom ra;        /* regular expression atom (RI_ATOM) */
    45         int ri;                        /* instruction type (RI_*) */
    46         int a1, a2;                /* destination of RI_FORK and RI_JUMP */
    47         int mark;                /* mark (RI_MARK) */
    48 };
    49 
    50 /* regular expression program */
    51 struct regex {
    52         struct rinst *p;        /* the program */
    53         int n;                        /* number of instructions */
    54         int flg;                /* regcomp() flags */
    55 };
    56 
    57 /* regular expression matching state */
    58 struct rstate {
    59         char *s;                /* the current position in the string */
    60         char *o;                /* the beginning of the string */
    61         int mark[NGRPS * 2];        /* marks for RI_MARK */
    62         int pc;                        /* program counter */
    63         int flg;                /* flags passed to regcomp() and regexec() */
    64         int dep;                /* re_rec() depth */
    65 };
    66 
    67 /* regular expression tree; used for parsing */
    68 struct rnode {
    69         struct ratom ra;        /* regular expression atom (RN_ATOM) */
    70         struct rnode *c1, *c2;        /* children */
    71         int mincnt, maxcnt;        /* number of repetitions */
    72         int grp;                /* group number */
    73         int rn;                        /* node type (RN_*) */
    74 };
    75 
    76 static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
    77 {
    78         struct rnode *rnode = malloc(sizeof(*rnode));
    79         memset(rnode, 0, sizeof(*rnode));
    80         rnode->rn = rn;
    81         rnode->c1 = c1;
    82         rnode->c2 = c2;
    83         rnode->mincnt = 1;
    84         rnode->maxcnt = 1;
    85         return rnode;
    86 }
    87 
    88 static void rnode_free(struct rnode *rnode)
    89 {
    90         if (rnode->c1)
    91                 rnode_free(rnode->c1);
    92         if (rnode->c2)
    93                 rnode_free(rnode->c2);
    94         free(rnode->ra.s);
    95         free(rnode);
    96 }
    97 
    98 static int uc_len(char *s)
    99 {
   100         int c = (unsigned char) s[0];
   101         if (~c & 0xc0)                /* ASCII or invalid */
   102                 return c > 0;
   103         if (~c & 0x20)
   104                 return 2;
   105         if (~c & 0x10)
   106                 return 3;
   107         if (~c & 0x08)
   108                 return 4;
   109         return 1;
   110 }
   111 
   112 static int uc_dec(char *s)
   113 {
   114         int c = (unsigned char) s[0];
   115         if (~c & 0xc0)                /* ASCII or invalid */
   116                 return c;
   117         if (~c & 0x20)
   118                 return ((c & 0x1f) << 6) | (s[1] & 0x3f);
   119         if (~c & 0x10)
   120                 return ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
   121         if (~c & 0x08)
   122                 return ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
   123         return c;
   124 }
   125 
   126 static void ratom_copy(struct ratom *dst, struct ratom *src)
   127 {
   128         dst->ra = src->ra;
   129         dst->s = NULL;
   130         if (src->s) {
   131                 int len = strlen(src->s);
   132                 dst->s = malloc(len + 1);
   133                 memcpy(dst->s, src->s, len + 1);
   134         }
   135 }
   136 
   137 static int brk_len(char *s)
   138 {
   139         int n = 1;
   140         if (s[n] == '^')        /* exclusion mark */
   141                 n++;
   142         if (s[n] == ']')        /* handling []a] */
   143                 n++;
   144         while (s[n] && s[n] != ']') {
   145                 if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
   146                         while (s[n] && s[n] != ']')
   147                                 n++;
   148                 if (s[n])
   149                         n++;
   150         }
   151         return s[n] == ']' ? n + 1 : n;
   152 }
   153 
   154 static void ratom_readbrk(struct ratom *ra, char **pat)
   155 {
   156         int len = brk_len(*pat);
   157         ra->ra = RA_BRK;
   158         ra->s = malloc(len + 1);
   159         memcpy(ra->s, *pat, len);
   160         ra->s[len] = '\0';
   161         *pat += len;
   162 }
   163 
   164 static void ratom_read(struct ratom *ra, char **pat)
   165 {
   166         int len;
   167         switch ((unsigned char) **pat) {
   168         case '.':
   169                 ra->ra = RA_ANY;
   170                 (*pat)++;
   171                 break;
   172         case '^':
   173                 ra->ra = RA_BEG;
   174                 (*pat)++;
   175                 break;
   176         case '$':
   177                 ra->ra = RA_END;
   178                 (*pat)++;
   179                 break;
   180         case '[':
   181                 ratom_readbrk(ra, pat);
   182                 break;
   183         case '\\':
   184                 if ((*pat)[1] == '<' || (*pat)[1] == '>') {
   185                         ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
   186                         *pat += 2;
   187                         break;
   188                 }
   189                 (*pat)++;
   190         default:
   191                 ra->ra = RA_CHR;
   192                 len = uc_len(*pat);
   193                 ra->s = malloc(8);
   194                 memcpy(ra->s, *pat, len);
   195                 ra->s[len] = '\0';
   196                 *pat += len;
   197         }
   198 }
   199 
   200 static char *uc_beg(char *beg, char *s)
   201 {
   202         while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
   203                 s--;
   204         return s;
   205 }
   206 
   207 static int isword(char *s)
   208 {
   209         int c = (unsigned char) s[0];
   210         return isalnum(c) || c == '_' || c > 127;
   211 }
   212 
   213 static char *brk_classes[][2] = {
   214         {":alnum:", "a-zA-Z0-9"},
   215         {":alpha:", "a-zA-Z"},
   216         {":blank:", " \t"},
   217         {":digit:", "0-9"},
   218         {":lower:", "a-z"},
   219         {":print:", "\x20-\x7e"},
   220         {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
   221         {":space:", " \t\r\n\v\f"},
   222         {":upper:", "A-Z"},
   223         {":word:", "a-zA-Z0-9_"},
   224         {":xdigit:", "a-fA-F0-9"},
   225 };
   226 
   227 static int brk_match(char *brk, int c, int flg)
   228 {
   229         int beg, end;
   230         int i;
   231         int not = brk[0] == '^';
   232         char *p = not ? brk + 1 : brk;
   233         char *p0 = p;
   234         if (flg & REG_ICASE && c < 128 && isupper(c))
   235                 c = tolower(c);
   236         while (*p && (p == p0 || *p != ']')) {
   237                 if (p[0] == '[' && p[1] == ':') {
   238                         for (i = 0; i < LEN(brk_classes); i++) {
   239                                 char *cc = brk_classes[i][0];
   240                                 char *cp = brk_classes[i][1];
   241                                 if (!strncmp(cc, p + 1, strlen(cc)))
   242                                         if (!brk_match(cp, c, flg))
   243                                                 return not;
   244                         }
   245                         p += brk_len(p);
   246                         continue;
   247                 }
   248                 beg = uc_dec(p);
   249                 p += uc_len(p);
   250                 end = beg;
   251                 if (p[0] == '-' && p[1] && p[1] != ']') {
   252                         p++;
   253                         end = uc_dec(p);
   254                         p += uc_len(p);
   255                 }
   256                 if (flg & REG_ICASE && beg < 128 && isupper(beg))
   257                         beg = tolower(beg);
   258                 if (flg & REG_ICASE && end < 128 && isupper(end))
   259                         end = tolower(end);
   260                 if (c >= beg && c <= end)
   261                         return not;
   262         }
   263         return !not;
   264 }
   265 
   266 static int ratom_match(struct ratom *ra, struct rstate *rs)
   267 {
   268         if (ra->ra == RA_CHR) {
   269                 int c1 = uc_dec(ra->s);
   270                 int c2 = uc_dec(rs->s);
   271                 if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
   272                         c1 = tolower(c1);
   273                 if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
   274                         c2 = tolower(c2);
   275                 if (c1 != c2)
   276                         return 1;
   277                 rs->s += uc_len(ra->s);
   278                 return 0;
   279         }
   280         if (ra->ra == RA_ANY) {
   281                 if (!rs->s[0] || (rs->s[0] == '\n' && !(rs->flg & REG_NOTEOL)))
   282                         return 1;
   283                 rs->s += uc_len(rs->s);
   284                 return 0;
   285         }
   286         if (ra->ra == RA_BRK) {
   287                 int c = uc_dec(rs->s);
   288                 if (!c || (c == '\n' && !(rs->flg & REG_NOTEOL)))
   289                         return 1;
   290                 rs->s += uc_len(rs->s);
   291                 return brk_match(ra->s + 1, c, rs->flg);
   292         }
   293         if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
   294                 return !(rs->s == rs->o || rs->s[-1] == '\n');
   295         if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
   296                 return rs->s[0] != '\0' && rs->s[0] != '\n';
   297         if (ra->ra == RA_WBEG)
   298                 return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
   299                         isword(rs->s));
   300         if (ra->ra == RA_WEND)
   301                 return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
   302                         (!rs->s[0] || !isword(rs->s)));
   303         return 1;
   304 }
   305 
   306 static struct rnode *rnode_parse(char **pat);
   307 
   308 static struct rnode *rnode_grp(char **pat)
   309 {
   310         struct rnode *rnode = NULL;
   311         if ((*pat)[0] != '(')
   312                 return NULL;
   313         *pat += 1;
   314         if ((*pat)[0] != ')') {
   315                 rnode = rnode_parse(pat);
   316                 if (!rnode)
   317                         return NULL;
   318         }
   319         if ((*pat)[0] != ')') {
   320                 rnode_free(rnode);
   321                 return NULL;
   322         }
   323         *pat += 1;
   324         return rnode_make(RN_GRP, rnode, NULL);
   325 }
   326 
   327 static struct rnode *rnode_atom(char **pat)
   328 {
   329         struct rnode *rnode;
   330         if (!**pat)
   331                 return NULL;
   332         if ((*pat)[0] == '|' || (*pat)[0] == ')')
   333                 return NULL;
   334         if ((*pat)[0] == '(') {
   335                 rnode = rnode_grp(pat);
   336         } else {
   337                 rnode = rnode_make(RN_ATOM, NULL, NULL);
   338                 ratom_read(&rnode->ra, pat);
   339         }
   340         if (!rnode)
   341                 return NULL;
   342         if ((*pat)[0] == '*' || (*pat)[0] == '?') {
   343                 rnode->mincnt = 0;
   344                 rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
   345                 *pat += 1;
   346         }
   347         if ((*pat)[0] == '+') {
   348                 rnode->mincnt = 1;
   349                 rnode->maxcnt = -1;
   350                 *pat += 1;
   351         }
   352         if ((*pat)[0] == '{') {
   353                 rnode->mincnt = 0;
   354                 rnode->maxcnt = 0;
   355                 *pat += 1;
   356                 while (isdigit((unsigned char) **pat))
   357                         rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
   358                 if (**pat == ',') {
   359                         (*pat)++;
   360                         if ((*pat)[0] == '}')
   361                                 rnode->maxcnt = -1;
   362                         while (isdigit((unsigned char) **pat))
   363                                 rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
   364                 } else {
   365                         rnode->maxcnt = rnode->mincnt;
   366                 }
   367                 *pat += 1;
   368                 if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
   369                         rnode_free(rnode);
   370                         return NULL;
   371                 }
   372         }
   373         return rnode;
   374 }
   375 
   376 static struct rnode *rnode_seq(char **pat)
   377 {
   378         struct rnode *c1 = rnode_atom(pat);
   379         struct rnode *c2;
   380         if (!c1)
   381                 return NULL;
   382         c2 = rnode_seq(pat);
   383         return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
   384 }
   385 
   386 static struct rnode *rnode_parse(char **pat)
   387 {
   388         struct rnode *c1 = rnode_seq(pat);
   389         struct rnode *c2;
   390         if ((*pat)[0] != '|')
   391                 return c1;
   392         *pat += 1;
   393         c2 = rnode_parse(pat);
   394         return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
   395 }
   396 
   397 static int rnode_count(struct rnode *rnode)
   398 {
   399         int n = 1;
   400         if (!rnode)
   401                 return 0;
   402         if (rnode->rn == RN_CAT)
   403                 n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
   404         if (rnode->rn == RN_ALT)
   405                 n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
   406         if (rnode->rn == RN_GRP)
   407                 n = rnode_count(rnode->c1) + 2;
   408         if (rnode->mincnt == 0 && rnode->maxcnt == 0)
   409                 return 0;
   410         if (rnode->mincnt == 1 && rnode->maxcnt == 1)
   411                 return n;
   412         if (rnode->maxcnt < 0) {
   413                 n = (rnode->mincnt + 1) * n + 1;
   414         } else {
   415                 n = (rnode->mincnt + rnode->maxcnt) * n +
   416                         rnode->maxcnt - rnode->mincnt;
   417         }
   418         if (!rnode->mincnt)
   419                 n++;
   420         return n;
   421 }
   422 
   423 static int rnode_grpnum(struct rnode *rnode, int num)
   424 {
   425         int cur = 0;
   426         if (!rnode)
   427                 return 0;
   428         if (rnode->rn == RN_GRP)
   429                 rnode->grp = num + cur++;
   430         cur += rnode_grpnum(rnode->c1, num + cur);
   431         cur += rnode_grpnum(rnode->c2, num + cur);
   432         return cur;
   433 }
   434 
   435 static int re_insert(struct regex *p, int ri)
   436 {
   437         p->p[p->n++].ri = ri;
   438         return p->n - 1;
   439 }
   440 
   441 static void rnode_emit(struct rnode *n, struct regex *p);
   442 
   443 static void rnode_emitnorep(struct rnode *n, struct regex *p)
   444 {
   445         int fork, done, mark;
   446         if (n->rn == RN_ALT) {
   447                 fork = re_insert(p, RI_FORK);
   448                 p->p[fork].a1 = p->n;
   449                 rnode_emit(n->c1, p);
   450                 done = re_insert(p, RI_JUMP);
   451                 p->p[fork].a2 = p->n;
   452                 rnode_emit(n->c2, p);
   453                 p->p[done].a1 = p->n;
   454         }
   455         if (n->rn == RN_CAT) {
   456                 rnode_emit(n->c1, p);
   457                 rnode_emit(n->c2, p);
   458         }
   459         if (n->rn == RN_GRP) {
   460                 mark = re_insert(p, RI_MARK);
   461                 p->p[mark].mark = 2 * n->grp;
   462                 rnode_emit(n->c1, p);
   463                 mark = re_insert(p, RI_MARK);
   464                 p->p[mark].mark = 2 * n->grp + 1;
   465         }
   466         if (n->rn == RN_ATOM) {
   467                 int atom = re_insert(p, RI_ATOM);
   468                 ratom_copy(&p->p[atom].ra, &n->ra);
   469         }
   470 }
   471 
   472 static void rnode_emit(struct rnode *n, struct regex *p)
   473 {
   474         int last;
   475         int jmpend[NREPS];
   476         int jmpend_cnt = 0;
   477         int i;
   478         if (!n)
   479                 return;
   480         if (n->mincnt == 0 && n->maxcnt == 0)
   481                 return;
   482         if (n->mincnt == 1 && n->maxcnt == 1) {
   483                 rnode_emitnorep(n, p);
   484                 return;
   485         }
   486         if (n->mincnt == 0) {
   487                 int fork = re_insert(p, RI_FORK);
   488                 p->p[fork].a1 = p->n;
   489                 jmpend[jmpend_cnt++] = fork;
   490         }
   491         for (i = 0; i < MAX(1, n->mincnt); i++) {
   492                 last = p->n;
   493                 rnode_emitnorep(n, p);
   494         }
   495         if (n->maxcnt < 0) {
   496                 int fork;
   497                 fork = re_insert(p, RI_FORK);
   498                 p->p[fork].a1 = last;
   499                 p->p[fork].a2 = p->n;
   500         }
   501         for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
   502                 int fork = re_insert(p, RI_FORK);
   503                 p->p[fork].a1 = p->n;
   504                 jmpend[jmpend_cnt++] = fork;
   505                 rnode_emitnorep(n, p);
   506         }
   507         for (i = 0; i < jmpend_cnt; i++)
   508                 p->p[jmpend[i]].a2 = p->n;
   509 }
   510 
   511 int regcomp(regex_t *preg, char *pat, int flg)
   512 {
   513         struct rnode *rnode = rnode_parse(&pat);
   514         struct regex *re;
   515         int n = rnode_count(rnode) + 3;
   516         int mark;
   517         if (!rnode)
   518                 return 1;
   519         rnode_grpnum(rnode, 1);
   520         re = malloc(sizeof(*re));
   521         memset(re, 0, sizeof(*re));
   522         re->p = malloc(n * sizeof(re->p[0]));
   523         memset(re->p, 0, n * sizeof(re->p[0]));
   524         mark = re_insert(re, RI_MARK);
   525         re->p[mark].mark = 0;
   526         rnode_emit(rnode, re);
   527         mark = re_insert(re, RI_MARK);
   528         re->p[mark].mark = 1;
   529         mark = re_insert(re, RI_MATCH);
   530         rnode_free(rnode);
   531         re->flg = flg;
   532         *preg = re;
   533         return 0;
   534 }
   535 
   536 void regfree(regex_t *preg)
   537 {
   538         struct regex *re = *preg;
   539         int i;
   540         for (i = 0; i < re->n; i++)
   541                 if (re->p[i].ri == RI_ATOM)
   542                         free(re->p[i].ra.s);
   543         free(re->p);
   544         free(re);
   545 }
   546 
   547 static int re_rec(struct regex *re, struct rstate *rs)
   548 {
   549         struct rinst *ri = NULL;
   550         if (rs->dep >= NDEPT)
   551                 return 1;
   552         rs->dep++;
   553         while (1) {
   554                 ri = &re->p[rs->pc];
   555                 if (ri->ri == RI_ATOM) {
   556                         if (ratom_match(&ri->ra, rs))
   557                                 return 1;
   558                         rs->pc++;
   559                         continue;
   560                 }
   561                 if (ri->ri == RI_MARK) {
   562                         if (ri->mark < NGRPS)
   563                                 rs->mark[ri->mark] = rs->s - rs->o;
   564                         rs->pc++;
   565                         continue;
   566                 }
   567                 if (ri->ri == RI_JUMP) {
   568                         rs->pc = ri->a1;
   569                         continue;
   570                 }
   571                 if (ri->ri == RI_FORK) {
   572                         struct rstate base = *rs;
   573                         rs->pc = ri->a1;
   574                         if (!re_rec(re, rs))
   575                                 return 0;
   576                         *rs = base;
   577                         rs->pc = ri->a2;
   578                         continue;
   579                 }
   580                 break;
   581         }
   582         rs->dep--;
   583         return ri->ri != RI_MATCH;
   584 }
   585 
   586 static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
   587 {
   588         int i;
   589         rs->pc = 0;
   590         for (i = 0; i < LEN(rs->mark); i++)
   591                 rs->mark[i] = -1;
   592         rs->dep = 0;
   593         if (!re_rec(re, rs)) {
   594                 for (i = 0; i < nsub; i++) {
   595                         psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
   596                         psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
   597                 }
   598                 return 0;
   599         }
   600         return 1;
   601 }
   602 
   603 int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
   604 {
   605         struct regex *re = *preg;
   606         struct rstate rs;
   607         memset(&rs, 0, sizeof(rs));
   608         rs.flg = re->flg | flg;
   609         rs.o = s;
   610         while (*s) {
   611                 rs.s = s;
   612                 s += uc_len(s);
   613                 if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
   614                         return 0;
   615         }
   616         return 1;
   617 }
   618 
   619 int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
   620 {
   621         return 0;
   622 }