add ascx

cs127 · Oct 18, 2024 · 9cc7586 · 9cc7586
1 parent 007d104
commit 9cc7586
Show file tree

Hide file tree

Showing 6 changed files with 553 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,11 @@
 # unicorn
 
-unicorn is a lightweight implementation of most of the standard C wide character functions, for platforms that don't support them, but still have a wide character type in the form of `wchar_t`, **as long as it is at least 16 bits (if unsigned) or 17 bits (if signed)**.
+unicorn is a lightweight implementation of most of the standard C wide character functions, for platforms that don't support them, but still have a wide character type in the form of `wchar_t` (mainly DJGPP).
+
+functions for converting extended ASCII codepages to Unicode are also provided as an addon (see *ascx* at the end of the document).
+
+> [!IMPORTANT]
+> to be able to use unicorn, the `wchar_t` type in your compiling environment must be **at least 16 bits (if unsigned) or 17 bits (if signed)**.
 
 > [!NOTE]
 > this is just a hobby project.
@@ -23,7 +28,7 @@ the only exception being the `wchar_t` type. unicorn uses the standard `wchar_t`
 ## compatibility
 
 unicorn is *almost* C89-compatible, except that it needs to know the maximum possible value of the `wchar_t` type.
-if your compiling environment does not support C99 or newer, then unless your compiler itself predefines `WCHAR_MAX`, `__WCHAR_MAX`, or `__WCHAR_MAX__`, you need to manually define one of them during compile time (make sure to give it the correct value!).
+if your compiling environment does not support C99 or newer, then unless your compiler itself predefines `WCHAR_MAX`, `__WCHAR_MAX`, or `__WCHAR_MAX__`, you need to manually define one of them during compile time (make sure to give it the correct value! and remember, if the type is not large enough, none of this will work!).
 
 ## what's not implemented
 
@@ -54,3 +59,26 @@ if your compiling environment does not support C99 or newer, then unless your co
 * `MB_LEN_MAX` and `MB_CUR_MAX` macros (both evaluate to `4`, because the multibyte encoding is always UTF-8).
 * wide character related `stdlib.h` functions (e.g. `wcstombs`, `mbstowcs`, `mblen`).
 * nonstandard `mbstowc` function, which is an alternative to `mbtowc`, but expects a `wchar_t*` instead of `wchar_t`, to be able to read surrogate pairs in UTF-16.
+
+### ascx
+
+unicorn includes an addon called ascx, which you can find in the `ascx` subdirectory of the sources. it includes functions for converting strings from various extended ASCII codepages to Unicode.
+
+the functions are:
+* `ascxtowc`: for converting a single character.
+* `ascxstowcs`: for converting an entire string.
+
+the behaviour of the two functions is identical to `mbtowc` and `mbstowcs` respectively, except that they take an extra parameter to specify which codepage the extended ASCII string is encoded in.
+
+the currently supported codepages are:
+* IBM437
+* IBM850
+* IBM858
+* Windows-1252
+* ISO-8859-1 with C0 and C1 control characters
+
+the IBM codepages each have two variants:
+* `C0`: containing C0 control characters.
+* `C0_REP`: containing IBM dingbats in place of C0 control characters.
+
+unassigned codepoints (e.g. `$81` in Windows-1252) are converted to the replacement character (`U+FFFD`).
diff --git a/src/ascx/ascx.h b/src/ascx/ascx.h
@@ -0,0 +1,56 @@
+#ifndef UC_H_ASCX
+#define UC_H_ASCX
+
+
+#include <limits.h>
+#include <stddef.h>
+
+
+/* wide-integer type that can hold either a converted wide character
+   or the UC_WEOF error marker.
+   the UC_WINT_IMPL guard skips these definitions when they have already
+   been provided before this header is included.
+   NOTE(review): presumably another unicorn header carries the same guard
+   with identical definitions -- confirm that they stay in sync.         */
+#ifndef UC_WINT_IMPL
+#define UC_WINT_IMPL
+
+typedef signed long int UC_wint_t;
+
+/* range of UC_wint_t (LONG_MAX / LONG_MIN come from <limits.h>). */
+#define UC_WINT_MAX LONG_MAX
+#define UC_WINT_MIN LONG_MIN
+
+/* returned by UC_ascxtowc when the conversion fails. */
+#define UC_WEOF (-1)
+
+#endif /* UC_WINT_IMPL */
+
+
+/* selects the codepage that an extended-ASCII string is encoded in.
+   the constants are numbered consecutively starting at zero, so
+   UC_TRANS_COUNT equals the number of selectable codepages;
+   UC_ascxstowcs rejects any value greater than or equal to it.      */
+typedef enum UC_trans
+{
+	UC_TRANS_437_C0,     /* IBM437 w/ C0 control characters. */
+	UC_TRANS_437_C0_REP, /* IBM437 w/ dingbats instead of C0 control characters. */
+
+	UC_TRANS_850_C0,     /* IBM850 w/ C0 control characters. */
+	UC_TRANS_850_C0_REP, /* IBM850 w/ dingbats instead of C0 control characters. */
+
+	UC_TRANS_858_C0,     /* IBM858 w/ C0 control characters. */
+	UC_TRANS_858_C0_REP, /* IBM858 w/ dingbats instead of C0 control characters. */
+
+	UC_TRANS_1252, /* Windows-1252. */
+
+	UC_TRANS_8859_1,     /* ISO-8859-1 w/ C0 and C1 control characters. */
+
+
+	UC_TRANS_COUNT /* number of codepages above; not a codepage itself. */
+}
+UC_trans_t;
+
+
+/* translation tables used by UC_ascxstowcs (defined outside this header).
+   each table is indexed relative to the first codepoint it covers:
+   - UC_TRANSTABLE_437_UPPER:  codepoints 0x80-0xFF of IBM437.
+   - UC_TRANSTABLE_850_UPPER:  codepoints 0x80-0xFF of IBM850
+                               (also the starting point for IBM858).
+   - UC_TRANSTABLE_IBM_C0_REP: entries 0-31 replace codepoints 0x00-0x1F,
+                               entry 32 replaces codepoint 0x7F.
+   - UC_TRANSTABLE_1252_UPPER: codepoints 0x80-0x9F of Windows-1252.
+   - UC_EURO:                  written to codepoint 0xD5 for IBM858.     */
+extern const wchar_t UC_TRANSTABLE_437_UPPER [128];
+extern const wchar_t UC_TRANSTABLE_850_UPPER [128];
+extern const wchar_t UC_TRANSTABLE_IBM_C0_REP [33];
+extern const wchar_t UC_TRANSTABLE_1252_UPPER [32];
+extern const wchar_t UC_EURO;
+
+
+/* converts the single extended-ASCII character c (taken as an
+   unsigned char) from codepage t to a wide character.
+   returns the wide character, or UC_WEOF if t is not a valid codepage. */
+UC_wint_t UC_ascxtowc(int c, UC_trans_t t);
+
+/* converts the null-terminated extended-ASCII string src from codepage t,
+   writing at most n wide characters to dest.
+   returns the number of wide characters written, not counting the
+   terminating null character, or (size_t)(-1) if t is not a valid codepage.
+   dest is only null-terminated if the end of src is reached within n
+   characters.
+   if dest is a null pointer, n and t are ignored and the length of src
+   is returned instead.                                                     */
+size_t
+UC_ascxstowcs(wchar_t* dest, const unsigned char* src, size_t n, UC_trans_t t);
+
+
+#endif /* UC_H_ASCX */
diff --git a/src/ascx/ascxfunc.c b/src/ascx/ascxfunc.c
@@ -0,0 +1,113 @@
+#include "ascx.h"
+
+#include <stddef.h>
+#include <string.h>
+
+
+/* copies n wchar_t elements from src to dest.
+   this is a plain memcpy, so the two ranges must not overlap. */
+#define UC_tablecopy(dest, src, n) memcpy(dest, src, (n) * sizeof(wchar_t))
+
+
+/* converts a single extended-ASCII character to a wide character.
+
+   c: the character to convert; it is converted to unsigned char first.
+   t: the codepage that c is encoded in.
+
+   returns the converted wide character, or UC_WEOF if t is not a valid
+   codepage.                                                            */
+UC_wint_t UC_ascxtowc(int c, UC_trans_t t)
+{
+	unsigned char bc;
+	wchar_t wc;
+
+	bc = (unsigned char)c;
+
+	if (UC_ascxstowcs(&wc, &bc, 1, t) == (size_t)(-1))
+	{
+		return UC_WEOF;
+	}
+
+	/* the two results are returned from separate statements instead of
+	   one conditional expression: in a conditional expression both
+	   operands are brought to a common type, and if wchar_t is an
+	   unsigned type as wide as int, UC_WEOF would become a large
+	   positive value instead of -1.                                    */
+	return (UC_wint_t)wc;
+}
+
+/* converts an extended-ASCII string to a wide character string.
+
+   dest: destination buffer, or a null pointer to only measure src.
+   src:  null-terminated extended-ASCII string encoded in codepage t.
+   n:    maximum number of wide characters to write to dest.
+   t:    the codepage that src is encoded in.
+
+   if dest is a null pointer, n and t are ignored and the length of src
+   is returned.
+   otherwise returns the number of wide characters written to dest, not
+   counting the terminating null character, or (size_t)(-1) if t is not
+   a valid codepage. dest is only null-terminated if the end of src is
+   reached before n characters have been written.                       */
+size_t
+UC_ascxstowcs(wchar_t* dest, const unsigned char* src, size_t n, UC_trans_t t)
+{
+	size_t i;
+	wchar_t table [256];
+
+	if (!dest)
+	{
+		/* no destination specified.
+		   simply return the length of the extended-ASCII string.
+		   (strlen expects a plain char pointer, hence the cast.)  */
+		return strlen((const char*)src);
+	}
+
+	if ((int)t < 0 || (int)t >= UC_TRANS_COUNT)
+	{
+		/* invalid transformation type.
+		   the lower bound is checked as well, because the enum may be
+		   represented by a signed type.                               */
+		return (size_t)(-1);
+	}
+
+	/* set up the transformation table: */
+
+	/* 1. start with an identity transform. */
+
+	for (i = 0; i < 256; i++) table[i] = (wchar_t)i;
+
+	/* 2. fill the C0 control codepoints with IBM dingbats if necessary. */
+
+	switch (t)
+	{
+	case UC_TRANS_437_C0_REP:
+	case UC_TRANS_850_C0_REP:
+	case UC_TRANS_858_C0_REP:
+		/* entries 0-31 cover 0x00-0x1F, entry 32 covers 0x7F. */
+		UC_tablecopy(&table[0x00], &UC_TRANSTABLE_IBM_C0_REP[0], 32);
+		UC_tablecopy(&table[0x7F], &UC_TRANSTABLE_IBM_C0_REP[32], 1);
+		break;
+
+	default:
+		/* every other codepage keeps its control characters. */
+		break;
+	}
+
+	/* 3. fill the upper half of the codepage
+	      (ignoring the small differences).   */
+
+	switch (t)
+	{
+	case UC_TRANS_437_C0:
+	case UC_TRANS_437_C0_REP:
+		UC_tablecopy(&table[0x80], &UC_TRANSTABLE_437_UPPER[0], 128);
+		break;
+
+	case UC_TRANS_850_C0:
+	case UC_TRANS_850_C0_REP:
+	case UC_TRANS_858_C0:
+	case UC_TRANS_858_C0_REP:
+		UC_tablecopy(&table[0x80], &UC_TRANSTABLE_850_UPPER[0], 128);
+		break;
+
+	case UC_TRANS_1252:
+		/* only 0x80-0x9F differ from the identity transform. */
+		UC_tablecopy(&table[0x80], &UC_TRANSTABLE_1252_UPPER[0], 32);
+		break;
+
+	default:
+		/* ISO-8859-1: the upper half stays an identity transform. */
+		break;
+	}
+
+	/* 4. add the small differences (e.g. euro sign) */
+
+	switch (t)
+	{
+	case UC_TRANS_858_C0:
+	case UC_TRANS_858_C0_REP:
+		UC_tablecopy(&table[0xD5], &UC_EURO, 1);
+		break;
+
+	default:
+		/* no further differences for the other codepages. */
+		break;
+	}
+
+	/* and we're done.
+	   ISO-8859-1 is a simple identity transform,
+	   so it skips steps 2-4.                     */
+
+	/* do the conversion. */
+
+	for (i = 0; i < n; i++)
+	{
+		if (src[i])
+		{
+			/* regular character. */
+			dest[i] = table[src[i]];
+		}
+		else
+		{
+			/* null character (end of string).
+			   handled separately from the table lookup, because
+			   table[0] may hold a dingbat rather than zero.      */
+			dest[i] = L'\0';
+			return i;
+		}
+	}
+
+	/* n characters were written without reaching the end of src;
+	   dest is not null-terminated in this case.                   */
+	return i;
+}