| ---
tregen.c (2446B)
---
1 #include
2 #include
3 #include
4 #include
5 #include "dfa.h"
6
7 /***
8 * Regular expression for matching.
9 */
10
/*
 * Regular expressions describing text to be ignored, grouped by kind.
 * Adjacent string literals are concatenated by the compiler, so each
 * comma-terminated group below is a single pattern.
 */
char *ignore[] =
{
	/* HTML that isn't A, IMG, or FONT */
	/*
	 * Must have a space somewhere inside the brackets to avoid catching
	 * bracketed text that contains no whitespace, such as <user@host>.
	 */
	"<[ \n\r]*("
		"[^aif]|"
		"a[^> \t\r\n]|"
		"i[^mM \t\r\n]|"
		"im[^gG \t\r\n]|"
		"img[^> \t\r\n]|"
		"f[^oO \t\r\n]|"
		"fo[^Nn \t\r\n]|"
		"fon[^tT \t\r\n]|"
		"font[^> \r\t\n]"
	")[^>]*[ \t\n\r][^>]*>",
	"<[ \n\r]*("
		"i|im|f|fo|fon"
	")[ \t\r\n][^>]*>",

	/*
	 * ignore html comments
	 * The body is any run of: a non-dash, a dash followed by a non-dash,
	 * two dashes followed by something other than '>', or a newline.
	 * The opener and closer are split across adjacent literals so that
	 * HTML-aware tools displaying this file do not mistake the pattern
	 * itself for a markup comment and drop it.
	 */
	"<!" "--([^\\-]|-[^\\-]|--[^>]|\n)*--" ">",

	/* random mail strings */
	"^message-id:.*\n([ ].*\n)*",
	"^in-reply-to:.*\n([ ].*\n)*",
	"^references:.*\n([ ].*\n)*",
	"^date:.*\n([ ].*\n)*",
	"^delivery-date:.*\n([ ].*\n)*",
	"e?smtp id .*",
	"^ id.*",
	"boundary=.*",
	"name=\"",
	"filename=\"",
	"news:<[^>]+>",
	"^--[^ ]*$",

	/* base64 encoding */
	"^[0-9a-zA-Z+\\-=/]+$",

	/* uu encoding */
	"^[!-Z]+$",

	/* little things */
	".",
	"\n"
};
57
/*
 * Patterns for the tokens of interest.  The single entry matches one or
 * more repetitions of either a character from the listed class or a digit
 * optionally followed by groups of [.,] and another digit.
 */
char *keywords[] = {
	"([a-zA-Z'`$!¡-]|[0-9]([.,][0-9])*)+",
};
62
/* When nonzero, dregcomp prints each pattern before compiling it. */
int debug;
64
65 Dreprog*
66 dregcomp(char *buf)
67 {
68 Reprog *r;
69 Dreprog *d;
70
71 if(debug)
72 print(">>> '%s'\n", buf);
73
74 r = regcomp(buf);
75 if(r == nil)
76 sysfatal("regcomp");
77 d = dregcvt(r);
78 if(d == nil)
79 sysfatal("dregcomp");
80 free(r);
81 return d;
82 }
83
/*
 * Copy the regular expression s to d, making it case-insensitive:
 * every lower-case ASCII letter outside a [...] character class is
 * rewritten as a two-letter class, so "a" becomes "[aA]".
 *
 * A character that follows an unescaped backslash is copied verbatim:
 * it is neither expanded as a letter nor counted as a class bracket.
 *
 * The output is NOT NUL-terminated.  The return value points just past
 * the last byte written so the caller can keep appending or terminate
 * the string itself.  d needs room for up to 4 bytes per byte of s.
 */
char*
strcpycase(char *d, char *s)
{
	int cc, esc;

	cc = 0;		/* bracket depth; nonzero while inside a class */
	esc = 0;	/* nonzero when the previous byte was an unescaped backslash */
	for(; *s; s++){
		if(esc){
			/* escaped byte: literal, no expansion, no bracket counting */
			*d++ = *s;
			esc = 0;
			continue;
		}
		if(*s == '\\'){
			*d++ = *s;
			esc = 1;
			continue;
		}
		if(*s == '[')
			cc++;
		if(*s == ']')
			cc--;
		if(!cc && 'a' <= *s && *s <= 'z'){
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
	}
	return d;
}
111
/*
 * Report a regular expression error: passes msg to sysfatal with a
 * "regerror: " prefix.
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}
117
118 void
119 buildre(Dreprog *re[3])
120 {
121 int i;
122 static char buf[16384], *s;
123
124 re[0] = dregcomp("^From ");
125
126 s = buf;
127 for(i=0; i |