utf8pad: improve padded printing and printing invalid unicode characters - stagit-gopher - A git gopher frontend. (mirror)
git clone git://bitreich.org/stagit-gopher/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/stagit-gopher/
Log
Files
Refs
Tags
README
LICENSE
---
commit 554a9fe2e9d12defd9d6253871d8261d3f3ef3c6
parent 7b93d02cd8f26ab9a25d967c72c359a22c91eb74
Author: Hiltjo Posthuma 
Date:   Sat,  9 Jan 2021 14:56:51 +0100

utf8pad: improve padded printing and printing invalid unicode characters

- Use unicode replacement character (codepoint 0xfffd) when a codepoint is
  invalid and proceed printing the rest of the characters.

- When a codepoint is invalid reset the internal state of mbtowc(3), from the
  OpenBSD man page:

  "  If a call to mbtowc() resulted in an undefined internal state, mbtowc()
     must be called with s set to NULL to reset the internal state before it
     can safely be used again."

- Make the function return 0 when `len` is 0 (this should not be an error).

Diffstat:
  M stagit-gopher-index.c               |      59 ++++++++++++++++++++++---------
  M stagit-gopher.c                     |      58 ++++++++++++++++++++++---------

2 files changed, 83 insertions(+), 34 deletions(-)
---
diff --git a/stagit-gopher-index.c b/stagit-gopher-index.c
@@ -10,6 +10,9 @@
 
 #include 
 
+#define PAD_TRUNCATE_SYMBOL    "\xe2\x80\xa6" /* symbol: "ellipsis" */
+#define UTF_INVALID_SYMBOL     "\xef\xbf\xbd" /* symbol: "replacement" */
+
 static git_repository *repo;
 
 static const char *relpath = "";
@@ -17,40 +20,62 @@ static const char *relpath = "";
 static char description[255] = "Repositories";
 static char *name = "";
 
-/* format `len' columns of characters. If string is shorter pad the rest
+/* Format `len' columns of characters. If string is shorter pad the rest
  * with characters `pad`. */
 int
 utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
 {
         wchar_t wc;
         size_t col = 0, i, slen, siz = 0;
-        int rl, w;
+        int inc, rl, w;
 
-        if (!len)
+        if (!bufsiz)
                 return -1;
+        if (!len) {
+                buf[0] = '\0';
+                return 0;
+        }
 
         slen = strlen(s);
-        for (i = 0; i < slen; i += rl) {
-                if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
-                        break;
-                if ((w = wcwidth(wc)) == -1)
+        for (i = 0; i < slen; i += inc) {
+                inc = 1;
+                if ((unsigned char)s[i] < 32)
                         continue;
-                if (col + w > len || (col + w == len && s[i + rl])) {
+
+                rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
+                if (rl < 0) {
+                        mbtowc(NULL, NULL, 0); /* reset state */
+                        inc = 1; /* next byte */
+                        w = 1; /* replacement char is one width */
+                } else if ((w = wcwidth(wc)) == -1) {
+                        continue;
+                } else {
+                        inc = rl;
+                }
+
+                if (col + w > len || (col + w == len && s[i + inc])) {
                         if (siz + 4 >= bufsiz)
                                 return -1;
-                        memcpy(&buf[siz], "\xe2\x80\xa6", 3);
-                        siz += 3;
-                        if (col + w == len && w > 1)
-                                buf[siz++] = pad;
+                        memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
+                        siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
                         buf[siz] = '\0';
-                        return 0;
+                        col++;
+                        break;
+                } else if (rl < 0) {
+                        if (siz + 4 >= bufsiz)
+                                return -1;
+                        memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
+                        siz += sizeof(UTF_INVALID_SYMBOL) - 1;
+                        buf[siz] = '\0';
+                        col++;
+                        continue;
                 }
-                if (siz + rl + 1 >= bufsiz)
+                if (siz + inc + 1 >= bufsiz)
                         return -1;
-                memcpy(&buf[siz], &s[i], rl);
-                col += w;
-                siz += rl;
+                memcpy(&buf[siz], &s[i], inc);
+                siz += inc;
                 buf[siz] = '\0';
+                col += w;
         }
 
         len -= col;
diff --git a/stagit-gopher.c b/stagit-gopher.c
@@ -19,6 +19,8 @@
 #include "compat.h"
 
 #define LEN(s)    (sizeof(s)/sizeof(*s))
+#define PAD_TRUNCATE_SYMBOL    "\xe2\x80\xa6" /* symbol: "ellipsis" */
+#define UTF_INVALID_SYMBOL     "\xef\xbf\xbd" /* symbol: "replacement" */
 
 struct deltainfo {
         git_patch *patch;
@@ -80,40 +82,62 @@ static char lastoidstr[GIT_OID_HEXSZ + 2]; /* id + newline + NUL byte */
 static FILE *rcachefp, *wcachefp;
 static const char *cachefile;
 
-/* format `len' columns of characters. If string is shorter pad the rest
+/* Format `len' columns of characters. If string is shorter pad the rest
  * with characters `pad`. */
 int
 utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
 {
         wchar_t wc;
         size_t col = 0, i, slen, siz = 0;
-        int rl, w;
+        int inc, rl, w;
 
-        if (!len)
+        if (!bufsiz)
                 return -1;
+        if (!len) {
+                buf[0] = '\0';
+                return 0;
+        }
 
         slen = strlen(s);
-        for (i = 0; i < slen; i += rl) {
-                if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
-                        break;
-                if ((w = wcwidth(wc)) == -1)
+        for (i = 0; i < slen; i += inc) {
+                inc = 1;
+                if ((unsigned char)s[i] < 32)
+                        continue;
+
+                rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
+                if (rl < 0) {
+                        mbtowc(NULL, NULL, 0); /* reset state */
+                        inc = 1; /* next byte */
+                        w = 1; /* replacement char is one width */
+                } else if ((w = wcwidth(wc)) == -1) {
                         continue;
-                if (col + w > len || (col + w == len && s[i + rl])) {
+                } else {
+                        inc = rl;
+                }
+
+                if (col + w > len || (col + w == len && s[i + inc])) {
                         if (siz + 4 >= bufsiz)
                                 return -1;
-                        memcpy(&buf[siz], "\xe2\x80\xa6", 3);
-                        siz += 3;
-                        if (col + w == len && w > 1)
-                                buf[siz++] = pad;
+                        memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
+                        siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
                         buf[siz] = '\0';
-                        return 0;
+                        col++;
+                        break;
+                } else if (rl < 0) {
+                        if (siz + 4 >= bufsiz)
+                                return -1;
+                        memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
+                        siz += sizeof(UTF_INVALID_SYMBOL) - 1;
+                        buf[siz] = '\0';
+                        col++;
+                        continue;
                 }
-                if (siz + rl + 1 >= bufsiz)
+                if (siz + inc + 1 >= bufsiz)
                         return -1;
-                memcpy(&buf[siz], &s[i], rl);
-                col += w;
-                siz += rl;
+                memcpy(&buf[siz], &s[i], inc);
+                siz += inc;
                 buf[siz] = '\0';
+                col += w;
         }
 
         len -= col;