trstr: faster searching for non-regex patterns - neatvi - [fork] simple vi-type editor with UTF-8 support
git clone git://src.adamsgaard.dk/neatvi
Log
Files
Refs
README
---
commit 2c0f3f357bcd4446492330f2313464c3c6348343
parent 7df341b1420aeeae6a9da53a5209762eda00341d
Author: Ali Gholami Rudi 
Date:   Sat, 20 Nov 2021 21:57:26 +0330

rstr: faster searching for non-regex patterns

Diffstat:
  M Makefile                            |       2 +-
  M ex.c                                |      34 +++++++++++++++----------------
  M mot.c                               |       8 ++++----
  A rstr.c                              |     115 +++++++++++++++++++++++++++++++
  M vi.h                                |       8 ++++++--

5 files changed, 142 insertions(+), 25 deletions(-)
---
diff --git a/Makefile b/Makefile
t@@ -3,7 +3,7 @@ CFLAGS = -Wall -O2
 LDFLAGS =
 
 OBJS = vi.o ex.o lbuf.o mot.o sbuf.o ren.o dir.o syn.o reg.o led.o \
-        uc.o term.o rset.o regex.o cmd.o conf.o
+        uc.o term.o rset.o rstr.o regex.o cmd.o conf.o
 
 all: vi
 
diff --git a/ex.c b/ex.c
t@@ -167,8 +167,8 @@ static int ex_search(char **pat)
         struct sbuf *kw;
         char *b = *pat;
         char *e = b;
-        char *pats[1];
-        struct rset *re;
+        char *pat_re;
+        struct rstr *re;
         int dir, row;
         kw = sbuf_make();
         while (*++e) {
t@@ -182,18 +182,18 @@ static int ex_search(char **pat)
                 ex_kwdset(sbuf_buf(kw), **pat == '/' ? 1 : -1);
         sbuf_free(kw);
         *pat = *e ? e + 1 : e;
-        if (ex_kwd(&pats[0], &dir))
+        if (ex_kwd(&pat_re, &dir))
                 return -1;
-        re = rset_make(1, pats, xic ? RE_ICASE : 0);
+        re = rstr_make(pat_re, xic ? RE_ICASE : 0);
         if (!re)
                 return -1;
         row = xrow + dir;
         while (row >= 0 && row < lbuf_len(xb)) {
-                if (rset_find(re, lbuf_get(xb, row), 0, NULL, 0) >= 0)
+                if (rstr_find(re, lbuf_get(xb, row), 0, NULL, 0) >= 0)
                         break;
                 row += dir;
         }
-        rset_free(re);
+        rstr_free(re);
         return row >= 0 && row < lbuf_len(xb) ? row : -1;
 }
 
t@@ -604,10 +604,9 @@ static void replace(struct sbuf *dst, char *rep, char *ln, int *offs)
 
 static int ec_substitute(char *loc, char *cmd, char *arg)
 {
-        struct rset *re;
+        struct rstr *re;
         int offs[32];
         int beg, end;
-        char *pats[1];
         char *pat = NULL, *rep = NULL;
         char *s = arg;
         int i;
t@@ -624,15 +623,15 @@ static int ec_substitute(char *loc, char *cmd, char *arg)
                 snprintf(xrep, sizeof(xrep), "%s", rep ? rep : "");
         free(pat);
         free(rep);
-        if (ex_kwd(&pats[0], NULL))
+        if (ex_kwd(&pat, NULL))
                 return 1;
-        re = rset_make(1, pats, xic ? RE_ICASE : 0);
+        re = rstr_make(pat, xic ? RE_ICASE : 0);
         if (!re)
                 return 1;
         for (i = beg; i < end; i++) {
                 char *ln = lbuf_get(xb, i);
                 struct sbuf *r = NULL;
-                while (rset_find(re, ln, LEN(offs) / 2, offs, 0) >= 0) {
+                while (rstr_find(re, ln, LEN(offs) / 2, offs, 0) >= 0) {
                         if (!r)
                                 r = sbuf_make();
                         sbuf_mem(r, ln, offs[0]);
t@@ -649,7 +648,7 @@ static int ec_substitute(char *loc, char *cmd, char *arg)
                         sbuf_free(r);
                 }
         }
-        rset_free(re);
+        rstr_free(re);
         return 0;
 }
 
t@@ -716,10 +715,9 @@ static int ex_exec(char *ln);
 
 static int ec_glob(char *loc, char *cmd, char *arg)
 {
-        struct rset *re;
+        struct rstr *re;
         int offs[32];
         int beg, end, not;
-        char *pats[1];
         char *pat;
         char *s = arg;
         int i;
t@@ -732,9 +730,9 @@ static int ec_glob(char *loc, char *cmd, char *arg)
         if (pat && pat[0])
                 ex_kwdset(pat, +1);
         free(pat);
-        if (ex_kwd(&pats[0], NULL))
+        if (ex_kwd(&pat, NULL))
                 return 1;
-        if (!(re = rset_make(1, pats, xic ? RE_ICASE : 0)))
+        if (!(re = rstr_make(pat, xic ? RE_ICASE : 0)))
                 return 1;
         xgdep++;
         for (i = beg + 1; i < end; i++)
t@@ -742,7 +740,7 @@ static int ec_glob(char *loc, char *cmd, char *arg)
         i = beg;
         while (i < lbuf_len(xb)) {
                 char *ln = lbuf_get(xb, i);
-                if ((rset_find(re, ln, LEN(offs) / 2, offs, 0) < 0) == not) {
+                if ((rstr_find(re, ln, LEN(offs) / 2, offs, 0) < 0) == not) {
                         xrow = i;
                         if (ex_exec(s))
                                 break;
t@@ -754,7 +752,7 @@ static int ec_glob(char *loc, char *cmd, char *arg)
         for (i = 0; i < lbuf_len(xb); i++)
                 lbuf_globget(xb, i, xgdep);
         xgdep--;
-        rset_free(re);
+        rstr_free(re);
         return 0;
 }
 
diff --git a/mot.c b/mot.c
t@@ -55,13 +55,13 @@ int lbuf_search(struct lbuf *lb, char *kw, int dir, int *r, int *o, int *len)
         int found = 0;
         int r0 = *r, o0 = *o;
         int i;
-        struct rset *re = rset_make(1, &kw, xic ? RE_ICASE : 0);
+        struct rstr *re = rstr_make(kw, xic ? RE_ICASE : 0);
         if (!re)
                 return 1;
         for (i = r0; !found && i >= 0 && i < lbuf_len(lb); i += dir) {
                 char *s = lbuf_get(lb, i);
                 int off = dir > 0 && r0 == i ? uc_chr(s, o0 + 1) - s : 0;
-                while (rset_find(re, s + off, 1, offs,
+                while (rstr_find(re, s + off, 1, offs,
                                 off ? RE_NOTBOL : 0) >= 0) {
                         if (dir < 0 && r0 == i &&
                                         uc_off(s, off + offs[0]) >= o0)
t@@ -71,11 +71,11 @@ int lbuf_search(struct lbuf *lb, char *kw, int dir, int *r, int *o, int *len)
                         *r = i;
                         *len = uc_off(s + off + offs[0], offs[1] - offs[0]);
                         off += offs[1] > offs[0] ? offs[1] : offs[1] + 1;
-                        if (dir > 0)
+                        if (dir > 0 || !s[off])
                                 break;
                 }
         }
-        rset_free(re);
+        rstr_free(re);
         return !found;
 }
 
diff --git a/rstr.c b/rstr.c
t@@ -0,0 +1,115 @@
+#include 
+#include 
+#include 
+#include 
+#include "vi.h"
+
+struct rstr {
+        struct rset *rs;        /* the compiled regular expression */
+        char *str;                /* simple search string */
+        int icase;
+        int lbeg, lend;
+        int wbeg, wend;
+};
+
+static int rstr_simple(struct rstr *rs, char *re)
+{
+        char *beg;
+        char *end;
+        rs->lbeg = re[0] == '^';
+        if (rs->lbeg)
+                re++;
+        rs->wbeg = re[0] == '\\' && re[1] == '<';
+        if (rs->wbeg)
+                re += 2;
+        beg = re;
+        while (re[0] && !strchr("\\.*+?[]{}()$", (unsigned char) re[0]))
+                re++;
+        end = re;
+        rs->wend = re[0] == '\\' && re[1] == '>';
+        if (rs->wend)
+                re += 2;
+        rs->lend = re[0] == '$';
+        if (rs->lend)
+                re++;
+        if (!re[0]) {
+                int len = end - beg;
+                rs->str = malloc(len + 1);
+                memcpy(rs->str, beg, len);
+                rs->str[len] = '\0';
+                return 0;
+        }
+        return 1;
+}
+
+struct rstr *rstr_make(char *re, int flg)
+{
+        struct rstr *rs = malloc(sizeof(*rs));
+        memset(rs, 0, sizeof(*rs));
+        rs->icase = flg & RE_ICASE;
+        if (rstr_simple(rs, re))
+                rs->rs = rset_make(1, &re, flg);
+        if (!rs->rs && !rs->str) {
+                free(rs);
+                return NULL;
+        }
+        return rs;
+}
+
+static int isword(char *s)
+{
+        int c = (unsigned char) s[0];
+        return isalnum(c) || c == '_' || c > 127;
+}
+
+static int match_case(char *s, char *r, int icase)
+{
+        for (; *r && *s; s++, r++) {
+                if (!icase && *s != *r)
+                        return 1;
+                if (icase && tolower((unsigned char) *s) != tolower((unsigned char) *r))
+                        return 1;
+        }
+        return 0;
+}
+
+/* return zero if an occurrence is found */
+int rstr_find(struct rstr *rs, char *s, int n, int *grps, int flg)
+{
+        int len;
+        char *beg, *end;
+        char *r;
+        if (rs->rs)
+                return rset_find(rs->rs, s, n, grps, flg);
+        if ((rs->lbeg && (flg & RE_NOTBOL)) || (rs->lend && (flg & RE_NOTEOL)))
+                return -1;
+        len = strlen(rs->str);
+        beg = s;
+        end = s + strlen(s) - len - 1;
+        if (rs->lbeg)
+                end = beg;
+        if (rs->lend)
+                beg = end;
+        for (r = beg; r <= end; r++) {
+                if (rs->wbeg && r > s && (isword(r - 1) || !isword(r)))
+                        continue;
+                if (rs->wend && r[len] && (!isword(r + len - 1) || isword(r + len)))
+                        continue;
+                if (!match_case(r, rs->str, rs->icase)) {
+                        if (n >= 1) {
+                                grps[0] = r - s;
+                                grps[1] = r - s + len;
+                        }
+                        return 0;
+                }
+        }
+        return -1;
+}
+
+void rstr_free(struct rstr *rs)
+{
+        if (rs->rs)
+                rset_free(rs->rs);
+        free(rs->str);
+        free(rs);
+}
diff --git a/vi.h b/vi.h
t@@ -45,15 +45,19 @@ void sbuf_printf(struct sbuf *sbuf, char *s, ...);
 int sbuf_len(struct sbuf *sb);
 void sbuf_cut(struct sbuf *s, int len);
 
-/* regular expression sets */
+/* regular expressions */
 #define RE_ICASE                1
 #define RE_NOTBOL                2
 #define RE_NOTEOL                4
-
+/* regular expression sets: searching for multiple regular expressions */
 struct rset *rset_make(int n, char **pat, int flg);
 int rset_find(struct rset *re, char *s, int n, int *grps, int flg);
 void rset_free(struct rset *re);
 char *re_read(char **src);
+/* searching for a single pattern regular expression */
+struct rstr *rstr_make(char *re, int flg);
+int rstr_find(struct rstr *rs, char *s, int n, int *grps, int flg);
+void rstr_free(struct rstr *rs);
 
 /* rendering lines */
 int *ren_position(char *s);