/*
   U T F 8 . C

   Dumps a selected portion of Unicode Plane 0 in UTF8 to standard output.

   Output is one line per character:
     [c]  U+xxxx  name

   where:
     c is the character in UTF-8,
     xxxx is the 4-digit hex code,
     name is the character's name from the Unicode database,
       or "(unknown)" if no name was found.

   Usage:
     utf8                Dump all of BMP except controls.
     utf8 hex            Dump <hex> through FFFF.
     utf8 hex1 hex2      Dump <hex1> through <hex2>.
     utf8 -w [hex [hex]] As above but suitable for the Web.
     utf8 -p directory   Include this to specify directory for database files.

   Default locations for the Unicode database files, tried in this order:
     /pub/ftp/kermit/charsets/
     /www/data/ftp/kermit/charsets/

   Obtain up-to-date copies of database files from:
     http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
     http://www.unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt

   If the Unicode database can't be found, the characters are dumped
   without names.

   Assumes Unicode database is in ascending order with one record
   per line: first field is hex code, second field is name; field
   separator is semicolon (;).  The DerivedBidiClass file need not be
   in code order.

   ANSI C required.

   F. da Cruz, Columbia University, May 2000.

   Updated 10 June 2003: new -w option makes a Web version that:
    - Puts &lt; for <, &gt; for >, &amp; for &.
    - Puts a space before each combining mark
    - Puts U+200E (LTRM) after each RTL character.
    - Automatically substitutes space for control/formatting characters.

   Updated 18 Jun 2003:
    - new -p option to specify path for database files.
    - Reads DerivedBidiClass.txt to get BIDI class for undefined code points.
    - getfields() strips leading and trailing blanks from each field.

   Updated 25 Jun 2003:
    - Show hex code as U+xxxx to avoid having the digits in certain entries
      turned into Hindi digits (don't ask!)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Shorthand names for the basic types used in this file */

#define USHORT unsigned short
#define ULONG unsigned long
#define CONST const
#define CHAR unsigned char
/* Size used for filename buffers; defined here only if not already defined */
#ifndef MAXPATHLEN
#define MAXPATHLEN 1024
#endif /* MAXPATHLEN */

/*
  Marker bits OR'd into the first byte of a UTF-8 sequence, indexed by the
  length of the sequence in bytes.  ucs2_to_utf8() never uses index 0.
*/
CHAR firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

/* Default directories for Unicode database files */
/* main() tries ucdata first and then ucdata2 if ucdata2 is not null; */
/* the -p option replaces ucdata and sets ucdata2 to null. */

char * ucdata = "/pub/ftp/kermit/charsets/";
char * ucdata2 = "/www/data/ftp/kermit/charsets/";

char * argv0;				/* My name */
char ltrm[4];				/* Left To Right Mark */
					/* (UTF-8 bytes of U+200E, set by main) */
char line[1024];			/* Database line buffer */
char * field[16];			/* Fields within line */
					/* (pointers into line[], set by getfields) */

struct lohi {				/* Struct for ranges */
    int lo;				/* First code of the range */
    int hi;				/* Last code of the range (inclusive) */
};

struct lohi bidi[256];			/* Default RTL blocks */
					/* (filled and sorted by lo in main) */
int b = 0;				/* Number of RTL blocks */
					/* (count of bidi[] entries in use) */

/*
  ucs2_to_utf8() - Convert one UCS-2 code to its UTF-8 byte sequence.

  ucs2   The code to convert.
  utf8   Receives a pointer to the resulting bytes, which are followed by
         a NUL byte.

  Returns the number of bytes in the sequence, not counting the NUL.

  The bytes are kept in a static buffer that the next call overwrites, so
  a caller that needs to keep them must copy them out first.
*/
int
ucs2_to_utf8(USHORT ucs2, CHAR ** utf8) {

    static CHAR utf8return[8]={0,0,0,0,0,0,0,0};
    register CONST ULONG byteMask = 0xBF; /* Clears bit 6 of a trail byte */
    register CONST ULONG byteMark = 0x80; /* Sets bit 7 of a trail byte */
    int utf8len = 0;
    int i = 0;

    /* Pick the sequence length from the magnitude of the code */

    if (ucs2 < 0x80) {
        utf8len = 1;
    } else if (ucs2 < 0x800) {
        utf8len = 2;
    } else
#ifdef DO_UCS4
      /* This is always true for UCS-2 but would be needed for UCS-4*/
      /* When ucs2 is USHORT this gives compiler warnings. */
      if (ucs2 <= 0xffff)
#endif /* DO_UCS4 */
    {
        utf8len = 3;
    }
#ifdef DO_UCS4
/* The following would be for UCS-4 */
    else if (ucs2 < 0x200000) {
        utf8len = 4;
    } else if (ucs2 < 0x4000000) {
        utf8len = 5;
    } else if (ucs2 <= 0x7FFFFFFFUL) {	/* 31 bits = max for UCS4 */
        utf8len = 6;
    } else {
        /* 0xFFFD is >= 0x800, so it needs a three-byte sequence */
        utf8len = 3;
        ucs2 = 0xFFFD;                  /* Replacement for invalid char */
    }
#endif /* DO_UCS4 */

    i = utf8len;                        /* index into utf8return */
    utf8return[i--] = 0;                /* Null terminate the string */

    /* Fill in trail bytes from last to first, six bits at a time */

    while (i > 0) {
        utf8return[i--] = (CHAR)((ucs2 | byteMark) & byteMask);
        ucs2 >>= 6;
    }

    /* What is left of the code goes in the lead byte with its length mark */

    utf8return[0] = (CHAR)(ucs2 | firstByteMark[utf8len]);

    *utf8 = utf8return;
    return(utf8len);
}

/*
  usage() - Print a one-line usage summary on stderr and exit.

  s   Program name to show in the message.

  Never returns: the program exits with status 1.  The return type is
  spelled out as int because that is what the original implicit-int
  definition declared; no value is ever returned.
*/
int
usage(char * s) {
    fprintf(stderr,"Usage: %s [-w] [-p directory] [hex [hex]]\n",s);
    exit(1);
}

/*
  hextoint() - Convert a string of hexadecimal digits to an integer.

  s   NUL-terminated string of hex digits in either case, with no prefix.

  Returns the value of the string; an empty string gives 0.

  If the string contains anything that is not a hex digit, or if its value
  does not fit in an unsigned int, usage() is called, which exits.
*/
int
hextoint(char * s) {
    unsigned int x = 0;
    unsigned int d = 0;
    char c;

    while ((c = *s++) != '\0') {
	if (c >= 'A' && c <= 'F')
	  d = c - 'A' + 10;
	else if (c >= 'a' && c <= 'f')
	  d = c - 'a' + 10;
	else if (c >= '0' && c <= '9')
	  d = c - '0';
	else
	  usage(argv0);			/* Not a hex digit */
	if (x > (~0U >> 4))		/* Next shift would lose high bits */
	  usage(argv0);
	x = (x << 4) + d;
    }
    return((int)x);
}

/* clearfields() - Set all 16 entries of field[] to null pointers */

void
clearfields() {
    char ** slot = field;
    char ** end = field + 16;

    while (slot < end)
      *slot++ = (char *)0;
}

/*
  getfields() - Split a database line into fields.

  s   The line to split.  It is modified in place: each separator and any
      blanks or tabs just before it are overwritten with NUL.

  Fields end at a semicolon (;) or at a number sign (#).  A pointer to
  each field, with leading and trailing blanks and tabs removed, is stored
  in the next entry of field[], up to 16 entries.  A number sign introduces
  a comment, so nothing after it is examined.  Text that follows the last
  separator is not stored as a field.

  Entries of field[] that are not filled in are left as they were; call
  clearfields() first if stale pointers matter.
*/
void
getfields(char * s) {
    int i = 0;
    char * p = s, * q = s, * t;
    char sep;

    while (*p && i < 16) {
	if (*p == ';' || *p == '#') {	/* Field separator */
	    sep = *p;			/* Remember which one it was */
	    t = q;			/* Beginning of this field */
	    while (*t == ' ' || *t == '\t') /* Trim leading whitespace */
	      t++;
	    field[i++] = t;
	    *p = '\0';			/* End of this field */
	    t = p;			/* Trim trailing whitespace, */
	    while (t > q && (t[-1] == ' ' || t[-1] == '\t'))
	      *--t = '\0';		/* staying inside this field */
	    if (sep == '#')		/* Comment introducer terminates */
	      return;
	    q = p+1;			/* Advance to next field */
	}
	p++;
    }
}

/*
  fileopen() - Open a file in a given directory for reading.

  path   Directory name.  May be empty, in which case name is used alone.
         A slash is added after it if it does not already end with one.
  name   Name of the file within the directory.

  Returns the open stream, or a null pointer if either argument is null,
  if the two names together are longer than MAXPATHLEN, or if fopen()
  fails.
*/
FILE *
fileopen(char * path, char * name) {
    char filename[MAXPATHLEN+2];	/* Buffer for filespec */
    size_t n, k;

    if (!path || !name)
      return((FILE *)0);
    n = strlen(path);
    k = strlen(name);
    if (n + k > MAXPATHLEN)
      return((FILE *)0);

    /* At most n + 1 + k + 1 <= MAXPATHLEN + 2 bytes are written below */

    memcpy(filename,path,n);
    if (n > 0 && path[n-1] != '/')
      filename[n++] = '/';
    memcpy(&filename[n],name,k+1);	/* k+1 copies the terminating NUL */
    return(fopen(filename,"r"));
}

/* Given s == hex "XXXX" or "XXXX..XXXX" constructs lo,hi pair */

struct lohi
splitpair(char * s) {
    struct lohi x;
    char * p, * q;
    p = s;
    for (q = p; *q; q++)
      if (*q == '.')
	break;
    if (*q == '.') {
	while (*q == '.')
	  *q++ = '\0';
    }	    
    x.lo = hextoint(p);
    x.hi = *q ? hextoint(q) : x.lo;
    return(x);
}

/*
  main() - Dump a range of Unicode Plane 0 in UTF-8 to standard output.

  Steps:
   1. Parse the command line: -w (Web output), -p directory (where the
      database files are), and up to two hex codes giving the range.
      The range defaults to 20 through FFFF.
   2. Build the UTF-8 form of U+200E (Left-To-Right Mark) in ltrm[].
   3. Read DerivedBidiClass.txt, if it can be opened, and keep the ranges
      whose class starts with 'R' or is "AL" in bidi[], sorted by lo.
   4. Open UnicodeData.txt, if possible, and walk the range, printing one
      line per code: the character in brackets, U+xxxx, and the name.

  Returns 0.  Command-line errors go through usage(), which exits with
  status 1, as does overflow of the bidi[] table.
*/
int
main(int argc, char *argv[]) {

    FILE * fp;				/* Unicode database file pointer */
    USHORT x;				/* Unicode values */
    CHAR * buf = NULL;			/* UTF-8 buffer pointer */
    char * p;				/* Name of current character */
    char * argv1 = (char *)0, * argv2 = (char *)0;
    int i, j, m, current = -1, all = 0, web = 0;
    int rtl, combining;
    unsigned int xx, from = 0, to = 0;
    struct lohi z;

    argv0 = argv[0];			/* My name */

    all = 1;
    for (i = 1; i < argc; i++) {	/* Parse command-line args */
	if (*argv[i] == '-') {
	    if (!strcmp(argv[i],"-w")) {
		web++;
		continue;
	    } else if (!strcmp(argv[i],"-p")) {
		if (++i >= argc)	/* -p must be followed by a directory */
		  usage(argv0);
		ucdata = argv[i];
		ucdata2 = (char *)0;	/* Don't fall back to the default */
	    } else {
		usage(argv0);
	    }
	} else if (!argv1) {
	    argv1 = argv[i];
	    all = 0;			/* Explicit range: don't skip controls */
	} else if (!argv2) {
	    argv2 = argv[i];
	} else
	  usage(argv0);
    }
    if (!argv1) argv1 = "20";		/* Supply defaults */
    if (!argv2) argv2 = "FFFF";

    x = hextoint("200E");		/* UTF-8 for LTRM */
    m = ucs2_to_utf8(x,&buf);
    if (m > 3) m = 3;			/* ltrm[] holds 3 bytes plus NUL */
    for (i = 0; i < m; i++)
      ltrm[i] = buf[i];
    ltrm[3] = '\0';

    from = hextoint(argv1);		/* Get range as ints */
    if (from < 32)			/* Check range and sanity */
      usage(argv0);
    to = hextoint(argv2);
    if (to > 0xffff)
      usage(argv0);

/* Load table of default BIDI class for character blocks */

    fp = fileopen(ucdata,"DerivedBidiClass.txt");
    if (!fp && ucdata2)
      fp = fileopen(ucdata2,"DerivedBidiClass.txt");
    while (fp && fgets(line,1023,fp)) {	/* Read a line */
	clearfields();
	if (line[0] == '#' || !line[0])
	  continue;
	getfields(line);		/* Separate fields */
	if (!field[0]) continue;	/* Comment or blank line */
	if (!field[1]) continue;	/* No properties */
	z = splitpair(field[0]);
	if (*(field[1]) == 'R' || !strcmp(field[1],"AL")) {
	    bidi[b] = z;		/* (might not be portable)*/
	    b++;
	    if (b > 254) {
		fprintf(stderr,"Too many BIDI blocks (255 max)\n");
		exit(1);
	    }
	}
    }
    if (fp) {				/* Close file */
	fclose(fp);
	fp = (FILE *)0;
    }
    if (b > 1) {			/* Have to sort these */
	struct lohi t;			/* Simple exchange sort is fine, */
	for (i = 0; i < b-1; i++) {	/* it's a small array */
	    for (j = i+1; j < b; j++) {
		if (bidi[i].lo > bidi[j].lo) {
		    t = bidi[i];	/* warning: struct assignment */
		    bidi[i] = bidi[j];	/* might not be portable */
		    bidi[j] = t;
		}
	    }
	}
    }

/* Open Unicode Character Database file */

    fp = fileopen(ucdata,"UnicodeData.txt"); /* Unicode Data file */
    if (!fp && ucdata2)
      fp = fileopen(ucdata2,"UnicodeData.txt");

/* Main loop... */

    clearfields();			/* Initialize database fields */
    for (xx = from; xx <= to; xx++) {	/* Loop through range */
	x = xx;				/* Convert index to unsigned short */
	if (all && (x == 0x7F || (x >= 0x80 && x < 0xA0))) /* Skip controls */
	  continue;
	if (web) {			/* If making a Web table */
	    /* Including Han crashes all known browsers (2003) */
	    if ((x >= 0x2b0e && x <= 0x303f) || /* Skip CJK */
		(x >= 0x3130 && x <= 0x319f) || /* but keep kana and bopomofo */
		(x >= 0x3200 && x <= 0xfff8)) {
		if (x == 0x2b0e || x == 0x3130 || x == 0x3200)
		  printf("...\n");	/* Mark start of each skipped run */
		continue;
	    }
	}
	p = (char *)0;			/* Initialize name */
	rtl = combining = 0;		/* and attributes */
	while (fp && current < x) {	/* Get entry for code x */
	    if (fgets(line,1023,fp)) {	/* Read a record */
		if (line[0] == '#' || !line[0])
		  continue;
		clearfields();		/* Forget the previous record */
		getfields(line);	/* Separate the fields */
		if (!field[0])		/* No separator: not a record */
		  continue;
		current = hextoint(field[0]); /* Get code */
	    } else {			/* Read failed */
		fclose(fp);		/* Close the database */
		fp = NULL;		/* and don't try reading it again */
		break;
	    }
	}
	if (current == x) {		/* If it's the desired record */
	    p = field[1];		/* get name of this character */
	    if (web) {			/* and if making a web table */
		if (field[3])		/* get character properties */
		  combining = atoi(field[3]);
		if (field[4])
		  rtl = (*(field[4]) == 'R' || !strcmp(field[4],"AL"));
	    }
	} else {			/* This char is undefined */
	    for (j = 0; j < b; j++) {	/* Get its default bidi category */
		if (bidi[j].lo > x)	/* (Table is sorted) */
		  break;
		if (x >= bidi[j].lo && x <= bidi[j].hi) {
		    rtl = 1;
		    break;
		}
	    }
	}
	if (!p) p = "(unknown)";	/* Supply this if name not known */
	putchar('[');			/* Print UTF-8 character in brackets */
	buf = (CHAR *)0;		/* Initialize value */
	if (web) {			/* Sensitive HTML characters */
	    switch(x) {
	      case '<': buf = (CHAR *)"&lt;" ; m = 4; break;
	      case '&': buf = (CHAR *)"&amp;"; m = 5; break;
	      case '>': buf = (CHAR *)"&gt;" ; m = 4; break;
	      default:
		if (x == current && field[2]) {	/* Print Controls as Space */
		    char * t = field[2];
		    if (*t == 'C') {
			buf = (CHAR *)" ";
			m = 1;
		    } else if (*t == 'Z' && *(t+1) != 's') { /* LS and PS */
			buf = (CHAR *)" ";
			m = 1;
		    }
		}
	    }
	}
	if (!buf)			/* Anything but ">&<" or Control */
	  m = ucs2_to_utf8(x,&buf);	/* convert to UTF-8 */
	if (combining > 0)		/* If combining */
	  putchar(' ');			/* put a space to combine with. */
	for (i = 0; i < m; i++)		/* Copy UTF-8 bytes */
	  putchar(buf[i]);
	if (combining == 233 || combining == 234) /* Combining double */
	  putchar(' ');			          /* Another space after */
	if (rtl)			/* If RTL put LTR Mark after */
	  printf("%s",ltrm);

	putchar(']');			/* Closing bracket */
	printf("  U+%04X  %s\n",(unsigned int)x,p); /* Print hex code and name */
	if (current == x)		/* Clear data */
	  clearfields();
    }
    return(0);
}