diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..412d1aa
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+	"files.associations": {
+		"*.h": "c"
+	}
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6f55dec
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# unicorn
+
+unicorn is a lightweight implementation of most of the standard C wide character functions, for platforms that don't support them, but still have a wide character type in the form of `wchar_t`, **as long as it is at least 16 bits (if unsigned) or 17 bits (if signed)**.
+
+> [!NOTE]
+> this is just a hobby project.
+> as much as I try to fix issues, you should still probably not expect it to always work properly.
+> also, the code isn't exactly the most optimized. you have my warning.
+
+unlike the standard functions which are locale-dependent, unicorn does not support locales, and always uses the same text encodings:
+
+* wide characters (`wchar_t`) are assumed to be encoded in UTF-32 if `WCHAR_MAX` is at least `0x10FFFF` (e.g. Linux), or UTF-16 otherwise (e.g. Windows).
+  * surrogates (`U+D800`-`U+DFFF`) are considered invalid in UTF-32.
+  * a new function (`wcstomb`) has been implemented as an alternative to `wctomb` to allow converting individual non-BMP characters (surrogate pairs) in UTF-16.
+* multibyte strings (used in `mbstowcs` and the like) are assumed to be encoded in UTF-8.
+  * surrogates (`U+D800`-`U+DFFF`) are considered invalid in multibyte strings.
+  * characters of length 5-8 are considered invalid, and so are 4-byte characters that exceed `U+10FFFF`.
+
+> [!WARNING]
+> do not put overlong characters (characters encoded in a larger number of bytes than needed) in your multibyte strings!
+> currently, unicorn does not consider them invalid, but **this will change**.
+
+everything that unicorn implements uses the same name as its counterpart in standard C, except with a `UC_` prefix.
+the only exception being the `wchar_t` type. unicorn uses the standard `wchar_t`.
+
+## compatibility
+
+unicorn is *almost* C89-compatible, except that it needs to know the maximum possible value of the `wchar_t` type.
+if your compilation environment does not support C99 or newer, then unless your compiler itself predefines `WCHAR_MAX`, `__WCHAR_MAX`, or `__WCHAR_MAX__`, you need to manually define one of them at compile time (make sure to give it the correct value!).
+
+## what's not implemented
+
+* the following will be implemented in a later update:
+  * `wcstok` function.
+
+* the following do not need to be implemented, because UTF-8 is stateless:
+  * `mbstate_t` type.
+  * `mbsinit` function.
+  * thread-safe versions of encoding conversion functions.
+
+* the following are not planned to be implemented any time soon (or maybe ever):
+  * `wctype_t` type.
+  * character type functions (`towlower`, `towupper`, `wcscasecmp`, `wcscasecmp_l`, `wcsncasecmp`, `wcsncasecmp_l`, `wctype`, and the `isw` family, including `iswctype`).
+  * string to number conversion functions (`wcstol`, `wcstoul`, `wcstoll`, `wcstoull`, `wcstof`, `wcstod`, and `wcstold`).
+  * functions that interact with file streams (e.g. `fgetws`, `fputws`, `wprintf`).
+  * `wcscoll` and `wcscoll_l` functions.
+  * `wcsftime` function.
+  * `wcsdup` function.
+  * `wcwidth` and `wcswidth` functions.
+  * `wcsxfrm` and `wcsxfrm_l` functions.
+
+## what *is* implemented
+
+> [!IMPORTANT]
+> you need to prepend a `UC_` prefix to the names of these functions, types, and macros!
+
+* every `wchar.h` function not mentioned above, including a few nonstandard POSIX-only functions, like `wcpcpy`.
+* `wint_t` type (equivalent to `signed long int`), with range macros `WINT_MIN` and `WINT_MAX`.
+* `WEOF` macro (evaluates to `-1`).
+* `MB_LEN_MAX` and `MB_CUR_MAX` macros (both evaluate to `4`, because the multibyte encoding is always UTF-8).
+* wide character related `stdlib.h` functions (e.g. `wcstombs`, `mbstowcs`, `mblen`).
+* nonstandard `wcstomb` function, which is an alternative to `wctomb`, but expects a `const wchar_t*` instead of a `wchar_t`, to be able to read surrogate pairs in UTF-16.
diff --git a/src/unicorn.c b/src/unicorn.c
new file mode 100644
index 0000000..689885b
--- /dev/null
+++ b/src/unicorn.c
@@ -0,0 +1,924 @@
+/* I wish I could use BCPL-style comments. damn you C89.. */
+
+
+#include "unicorn.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#if __STDC_VERSION__ >= 199901L
+#include <stdbool.h>
+#else
+typedef int bool;
+#define false 0
+#define true  1
+#endif
+
+
+#define UC_INVALID_SIZE ((size_t)(-1))
+
+#define UC_MAX_CODEPOINT 0x10FFFF
+
+#define UC_MAX_1BYTE 0x007F
+#define UC_MAX_2BYTE 0x07FF
+#define UC_MAX_3BYTE 0xFFFF
+#define UC_MAX_4BYTE UC_MAX_CODEPOINT
+
+#define UC_SURMASK 0x03FF
+#define UC_HIGHSUR 0xD800
+#define UC_LOWSUR  0xDC00
+
+#define UC_IS_IN_RANGE(c) ((c) >= 0 && (c) <= UC_MAX_CODEPOINT)
+
+#define UC_SUR_OFFSET 0x10000
+#define UC_IS_BMP(c) ((c) <= 0xFFFF)
+
+#define UC_IS_HIGHSUR(c) ((c) >= 0xD800 && (c) < 0xDC00)
+#define UC_IS_LOWSUR(c)  ((c) >= 0xDC00 && (c) < 0xE000)
+
+#define UC_IS_SUR(c) (UC_IS_HIGHSUR(c) || UC_IS_LOWSUR(c))
+
+#define UC_TOP1 0x80
+#define UC_TOP2 0xC0
+#define UC_TOP3 0xE0
+#define UC_TOP4 0xF0
+#define UC_TOP5 0xF8
+
+#define UC_BOTTOM3 0x07
+#define UC_BOTTOM4 0x0F
+#define UC_BOTTOM5 0x1F
+#define UC_BOTTOM6 0x3F
+
+#define UC_IS_1BYTE(c) (!((c) & UC_TOP1))
+#define UC_IS_2BYTE(c) (((c) & UC_TOP3) == UC_TOP2)
+#define UC_IS_3BYTE(c) (((c) & UC_TOP4) == UC_TOP3)
+#define UC_IS_4BYTE(c) (((c) & UC_TOP5) == UC_TOP4)
+
+#define UC_IS_CONT(c)  (((c) & UC_TOP2) == UC_TOP1)
+
+
+size_t UC_wcslen(const wchar_t* s)
+{
+	/* count the wide characters that precede the terminator. */
+	const wchar_t* end;
+
+	end = s;
+	while (*end != L'\0') end++;
+
+	return (size_t)(end - s);
+}
+
+size_t UC_wcsnlen(const wchar_t* s, size_t n)
+{
+	/* length of s, capped at n wide characters. */
+	size_t i;
+
+	i = 0;
+	/* test the bound first, so that s[n] is never read. */
+	while (i < n && s[i]) i++;
+	return i;
+}
+
+wchar_t* UC_wcscpy(wchar_t* dest, const wchar_t* src)
+{
+	/* copy src, terminator included, into dest. */
+	wchar_t* out;
+
+	out = dest;
+	while ((*out++ = *src++) != L'\0') { }
+	return dest;
+}
+
+wchar_t* UC_wcsncpy(wchar_t* dest, const wchar_t* src, size_t n)
+{
+	/* copy at most n wide characters of src into dest.
+	   if src is shorter than n, the rest of dest is zero-filled.
+	   if it is not, dest is left without a terminator.           */
+	size_t pos;
+
+	/* copy phase: stops at the terminator or after n characters. */
+	for (pos = 0; pos < n && src[pos]; pos++) dest[pos] = src[pos];
+
+	/* padding phase: does nothing if n characters were copied. */
+	for (; pos < n; pos++) dest[pos] = L'\0';
+
+	return dest;
+}
+
+wchar_t* UC_wcpcpy(wchar_t* dest, const wchar_t* src)
+{
+	/* copy src into dest; return the address of dest's terminator. */
+	wchar_t* out;
+
+	for (out = dest; *src != L'\0'; src++) *out++ = *src;
+	*out = L'\0';
+	return out;
+}
+
+wchar_t* UC_wcpncpy(wchar_t* dest, const wchar_t* src, size_t n)
+{
+	/* copy at most n wide characters of src into dest, zero-filling
+	   whatever is left of the n characters once src has ended.
+	   returns the address of the first terminator written to dest,
+	   or &dest[n] if src did not end within n characters.           */
+	size_t i,
+	       endpos;
+
+	/* copy phase: the bound is tested first, so src[n] is never read. */
+	for (i = 0; i < n && src[i]; i++) dest[i] = src[i];
+
+	/* this is either the first terminator about to be written, or n.
+	   it must be recorded before padding: recording it while padding
+	   would move it forward on every padded character.               */
+	endpos = i;
+
+	for (; i < n; i++) dest[i] = L'\0'; /* padding phase. */
+
+	return &dest[endpos];
+}
+
+wchar_t* UC_wcscat(wchar_t* dest, const wchar_t* src)
+{
+	/* append src, terminator included, to the end of dest. */
+	wchar_t* out;
+
+	/* find dest's terminator. */
+	out = dest;
+	while (*out != L'\0') out++;
+
+	/* overwrite it with src. */
+	while ((*out++ = *src++) != L'\0') { }
+
+	return dest;
+}
+
+wchar_t* UC_wcsncat(wchar_t* dest, const wchar_t* src, size_t n)
+{
+	/* append at most n wide characters of src to dest, then terminate
+	   dest. src need not be terminated if it holds n characters or more. */
+	size_t i,
+	       tail;
+
+	for (tail = 0; dest[tail]; tail++) { }
+
+	/* test the bound first, so that src[n] is never read. */
+	for (i = 0; i < n && src[i]; i++) dest[tail + i] = src[i];
+	dest[tail + i] = L'\0';
+	return dest;
+}
+
+int UC_wcscmp(const wchar_t* s1, const wchar_t* s2)
+{
+	/* compare s1 and s2. returns a negative value, zero, or a positive
+	   value if s1 is less than, equal to, or greater than s2.          */
+	size_t i;
+
+	for (i = 0; s1[i] == s2[i]; i++) if (!s1[i]) return 0;
+	/* compare instead of subtracting: a difference of int casts can
+	   overflow or wrap, which would give a result of the wrong sign. */
+	return s1[i] < s2[i] ? -1 : 1;
+}
+
+int UC_wcsncmp(const wchar_t* s1, const wchar_t* s2, size_t n)
+{
+	/* compare at most n wide characters. the result is <0, 0 or >0. */
+	size_t i;
+
+	for (i = 0; i < n && s1[i] == s2[i]; i++) if (!s1[i]) return 0;
+	if (i == n) return 0;
+	/* compare instead of subtracting: a difference of int casts can
+	   overflow or wrap, which would give a result of the wrong sign. */
+	return s1[i] < s2[i] ? -1 : 1;
+}
+
+wchar_t* UC_wcschr(const wchar_t* s, wchar_t c)
+{
+	/* find the first occurrence of c in s, or return NULL if there is
+	   none. the terminator counts as part of s, so searching for L'\0'
+	   returns the address of the terminator, as standard wcschr does.  */
+	wchar_t* st;
+
+	/* the cast only drops const for the return type; s is not modified. */
+	st = (wchar_t*)s;
+
+	/* test for a match before testing for the end of the string. */
+	for (; *st != c; st++) if (!*st) return NULL;
+	return st;
+}
+
+wchar_t* UC_wcsrchr(const wchar_t* s, wchar_t c)
+{
+	/* find the last occurrence of c in s, or return NULL if there is
+	   none. the terminator counts as part of s, so searching for L'\0'
+	   returns the address of the terminator, as standard wcsrchr does. */
+	size_t i,
+	       last;
+
+	last = UC_INVALID_SIZE; /* means "no match yet". */
+
+	for (i = 0; s[i]; i++) if (s[i] == c) last = i;
+	/* i is now the index of the terminator. */
+	if (!c) last = i;
+
+	return (last != UC_INVALID_SIZE) ? (wchar_t*)&s[last] : NULL;
+}
+
+wchar_t* UC_wcsstr(const wchar_t* s, const wchar_t* kernel)
+{
+	/* find the first occurrence of kernel in s, or return NULL if there
+	   is none. an empty kernel matches at the very start of s.
+	   could probably use some optimization.                             */
+	size_t i,
+	       klen,
+	       slen;
+
+	if (!*kernel) return (wchar_t*)s;
+	klen = UC_wcslen(kernel);
+	slen = UC_wcslen(s);
+
+	/* only try the positions where kernel fits entirely inside s,
+	   so that nothing past the terminator of s is ever read.      */
+	for (i = 0; i + klen <= slen; i++)
+	{
+		if (!UC_wcsncmp(&s[i], kernel, klen)) return (wchar_t*)&s[i];
+	}
+	return NULL;
+}
+
+size_t UC_wcsspn(const wchar_t* s, const wchar_t* accept)
+{
+	/* length of the leading part of s that is made up
+	   only of wide characters that appear in accept.  */
+	size_t len;
+
+	len = 0;
+	while (s[len] && UC_wcschr(accept, s[len])) len++;
+
+	return len;
+}
+
+size_t UC_wcscspn(const wchar_t* s, const wchar_t* reject)
+{
+	/* length of the leading part of s that is made up
+	   only of wide characters that do not appear in reject. */
+	size_t len;
+
+	len = 0;
+	while (s[len] && !UC_wcschr(reject, s[len])) len++;
+
+	return len;
+}
+
+wchar_t* UC_wcspbrk(const wchar_t* s, const wchar_t* accept)
+{
+	/* find the first wide character of s that also appears in accept,
+	   or return NULL if s has no such character.                      */
+	const wchar_t* p;
+
+	for (p = s; *p != L'\0'; p++)
+	{
+		/* the cast only drops const for the return type. */
+		if (UC_wcschr(accept, *p) != NULL) return (wchar_t*)p;
+	}
+
+	return NULL;
+}
+
+/*
+wchar_t* UC_wcstok(wchar_t* s, const wchar_t* delim, wchar_t** p)
+{
+}
+*/
+
+wchar_t* UC_wmemset(wchar_t* s, wchar_t c, size_t n)
+{
+	/* set the first n wide characters of s to c. */
+	wchar_t* p;
+
+	for (p = s; n > 0; n--) *p++ = c;
+	return s;
+}
+
+wchar_t* UC_wmemcpy(wchar_t* dest, const wchar_t* src, size_t n)
+{
+	return memcpy(dest, src, n * sizeof(wchar_t)); /* n is in wide characters. */
+}
+
+wchar_t* UC_wmemmove(wchar_t* dest, const wchar_t* src, size_t n)
+{
+	return memmove(dest, src, n * sizeof(wchar_t)); /* n is in wide characters. */
+}
+
+int UC_wmemcmp(const wchar_t* s1, const wchar_t* s2, size_t n)
+{
+	/* compare the first n wide characters. the result is <0, 0 or >0. */
+	size_t i;
+
+	for (i = 0; i < n && s1[i] == s2[i]; i++) { }
+	if (i == n) return 0;
+	/* compare instead of subtracting: a difference of int casts can
+	   overflow or wrap, which would give a result of the wrong sign. */
+	return s1[i] < s2[i] ? -1 : 1;
+}
+
+wchar_t* UC_wmemchr(const wchar_t* s, wchar_t c, size_t n)
+{
+	/* find the first occurrence of c among the first n wide characters
+	   of s, or return NULL if there is none. terminators are not
+	   treated specially.                                               */
+	const wchar_t* p;
+
+	for (p = s; n > 0; n--, p++)
+	{
+		/* the cast only drops const for the return type. */
+		if (*p == c) return (wchar_t*)p;
+	}
+	return NULL;
+}
+
+size_t UC_wcstombs(char* dest, const wchar_t* src, size_t n)
+{
+	size_t isrc,  /* index of the next unread element of src. */
+	       idest, /* index of the next byte of dest; also the result. */
+	       trail; /* continuation bytes still owed to the current character. */
+	unsigned long int csrc; /* codepoint that is currently being encoded. */
+	/* converts the wide string src to UTF-8. -1 is returned as a size_t. */
+	isrc = 0;
+	trail = 0;
+	/* one byte per turn. if dest is NULL, nothing is stored and n is ignored. */
+	for (idest = 0; idest < n || !dest; idest++)
+	{
+		if (!trail)
+		{
+			/* start of a character. */
+			/* it is validated and decoded into csrc before any output. */
+			if (!UC_IS_IN_RANGE(src[isrc]))
+			{
+				/* codepoint out of valid range. */
+				return -1;
+			}
+#if UC_UTF16 /* UTF-16 */
+			if (!UC_IS_SUR(src[isrc]))
+			{
+				/* regular character. */
+				csrc = src[isrc++];
+			}
+			else if
+			(
+				UC_IS_HIGHSUR(src[isrc]) &&
+				UC_IS_LOWSUR(src[isrc + 1])
+			)
+			{
+				/* surrogate pair: two 10-bit halves, high half first. */
+				csrc = 0;
+				csrc |= (src[isrc++] & UC_SURMASK) << 10;
+				csrc |= (src[isrc++] & UC_SURMASK) << 0;
+				csrc += UC_SUR_OFFSET;
+			}
+			else
+			{
+				/* invalid surrogate sequence. */
+				return -1;
+			}
+#else        /* UTF-32 */
+			if (!UC_IS_SUR(src[isrc]))
+			{
+				csrc = src[isrc++];
+			}
+			else
+			{
+				/* UTF-16 surrogate (invalid in UTF-32). */
+				return -1;
+			}
+#endif
+			if (!csrc)
+			{
+				/* null character (end of string). stored, but not counted. */
+				if (dest) dest[idest] = '\0';
+				return idest;
+			}
+			else if (csrc <= UC_MAX_1BYTE)
+			{
+				/* 1-byte (ASCII) character. */
+				trail = 0;
+				if (dest) dest[idest] = csrc;
+			}
+			else if (csrc <= UC_MAX_2BYTE)
+			{
+				/* first byte of a 2-byte character. */
+				trail = 1;
+				if (dest && idest + trail >= n) return idest; /* would not fit whole. */
+				if (dest) dest[idest] = (csrc >> 6) | UC_TOP2;
+			}
+			else if (csrc <= UC_MAX_3BYTE)
+			{
+				/* first byte of a 3-byte character. */
+				trail = 2;
+				if (dest && idest + trail >= n) return idest; /* would not fit whole. */
+				if (dest) dest[idest] = (csrc >> 12) | UC_TOP3;
+			}
+			else
+			{
+				/* first byte of a 4-byte (non-BMP) character.
+				   in UTF-16, this case will only occur with
+				   surrogate pairs.                           */
+				trail = 3;
+				if (dest && idest + trail >= n) return idest; /* would not fit whole. */
+				if (dest) dest[idest] = (csrc >> 18) | UC_TOP4;
+			}
+		}
+		else
+		{
+			/* continuation of a 2/3/4-byte character. */
+			trail--;
+			if (dest) dest[idest] =
+			((csrc >> (6 * trail)) & UC_BOTTOM6) | UC_TOP1;
+		}
+	}
+	/* dest is full: n bytes were stored, and no terminator was written. */
+	return idest;
+}
+
+size_t UC_mbstowcs(wchar_t* dest, const char* src, size_t n)
+{
+	size_t isrc,  /* index of the next unread byte of src. */
+	       idest, /* index of the next element of dest; also the result. */
+	       trail; /* continuation bytes still expected. */
+	bool lowsur;  /* true if the next turn must write a low surrogate. */
+	unsigned long int csrc; /* codepoint that is currently being decoded. */
+	/* converts the UTF-8 string src to wide characters. -1 means invalid. */
+	isrc = 0;
+	lowsur = false; /* always false in UTF-32. */
+	/* one element per turn. if dest is NULL, nothing is stored, n is ignored. */
+	for (idest = 0; idest < n || !dest; idest++)
+	{
+		if (!lowsur)
+		{
+			/* regular character or high surrogate.
+			   surrogates are only valid in UTF-16. */
+
+			/* read the first byte of the character. */
+
+			if (UC_IS_1BYTE(src[isrc]))
+			{
+				/* 1-byte (ASCII) character. */
+				trail = 0;
+				csrc = src[isrc++];
+			}
+			else if (UC_IS_2BYTE(src[isrc]))
+			{
+				/* first byte of a 2-byte character. */
+				trail = 1;
+				csrc = (src[isrc++] & UC_BOTTOM5) << 6;
+			}
+			else if (UC_IS_3BYTE(src[isrc]))
+			{
+				/* first byte of a 3-byte character.
+				   this might be a UTF-16 surrogate,
+				   which is invalid in UTF-8.        */
+				trail = 2;
+				csrc = (src[isrc++] & UC_BOTTOM4) << 12;
+			}
+			else if (UC_IS_4BYTE(src[isrc]))
+			{
+				/* first byte of a 4-byte (non-BMP) character.
+				   this will require checking later:
+				   if the codepoint is 0x10FFFF or lower,
+				   it requires a surrogate pair in UTF-16.
+				   otherwise, it is invalid entirely.         */
+				trail = 3;
+				csrc = (src[isrc++] & UC_BOTTOM3) << 18;
+			}
+			else
+			{
+				/* unexpected byte, which is either:
+				   * continuation byte
+				     (cannot be at the start of a character),
+				   or
+				   * first byte of a 5/6/7/8-byte character
+				     (nonstandard).                           */
+				return -1;
+			}
+
+			/* read the remaining bytes. */
+
+			while (trail)
+			{
+				if (!UC_IS_CONT(src[isrc]))
+				{
+					/* unexpected byte
+					   (expected a continuation byte). */
+					return -1;
+				}
+
+				csrc |=
+				(src[isrc++] & UC_BOTTOM6) << (6 * (--trail));
+			}
+
+			if (UC_IS_SUR(csrc))
+			{
+				/* UTF-16 surrogate (invalid in UTF-8). */
+				return -1;
+			}
+			else if (!UC_IS_IN_RANGE(csrc))
+			{
+				/* codepoint out of valid range. */
+				return -1;
+			}
+
+			if (!csrc)
+			{
+				/* null character (end of string). */
+				if (dest) dest[idest] = L'\0';
+				return idest;
+			}
+#if UC_UTF16 /* UTF-16 */
+			if (UC_IS_BMP(csrc))
+			{
+				/* BMP character.
+				   can be written normally. */
+				if (dest) dest[idest] = csrc;
+			}
+			else
+			{
+				/* non-BMP character. requires a surrogate pair.
+				   stop here if only one half of it would fit.   */
+				if (dest && idest + 1 >= n) return idest;
+				/* write the high surrogate now. */
+				if (dest) dest[idest] =
+				((csrc - UC_SUR_OFFSET) >> 10) | UC_HIGHSUR;
+
+				/* write the low surrogate on the next turn. */
+				lowsur = true;
+			}
+#else        /* UTF-32 */
+			if (dest) dest[idest] = csrc;
+#endif
+		}
+		else
+		{
+			/* this block of code can only be accessed in UTF-16. */
+
+			/* low surrogate. */
+			if (dest) dest[idest] =
+			((csrc - UC_SUR_OFFSET) & UC_SURMASK) | UC_LOWSUR;
+
+			/* reset the behaviour for the next turn. */
+			lowsur = false;
+		}
+	}
+	/* dest is full: n elements were stored, and no terminator was written. */
+	return idest;
+}
+
+static int UC_wctomb_internal(char* s, unsigned long int c)
+{
+	/* encode the codepoint c as UTF-8 into s, without a terminator,
+	   and return the number of bytes written (1 to 4).
+	   c is not validated here; the callers do that beforehand.      */
+	size_t len,
+	       left;
+
+	/* pick the length of the sequence, and write its lead byte:
+	   the length marker bits, plus the topmost payload bits of c. */
+	if (c <= UC_MAX_1BYTE)
+	{
+		/* ASCII: the byte is the codepoint itself. */
+		len = 1;
+		s[0] = c;
+	}
+	else if (c <= UC_MAX_2BYTE)
+	{
+		len = 2;
+		s[0] = (c >> 6) | UC_TOP2;
+	}
+	else if (c <= UC_MAX_3BYTE)
+	{
+		len = 3;
+		s[0] = (c >> 12) | UC_TOP3;
+	}
+	else
+	{
+		/* non-BMP character. callers in UTF-16 only get here
+		   with a codepoint decoded from a surrogate pair.    */
+		len = 4;
+		s[0] = (c >> 18) | UC_TOP4;
+	}
+
+	/* continuation bytes: 6 payload bits each, topmost bits first. */
+	for (left = len - 1; left > 0; left--)
+	{
+		s[len - left] = ((c >> (6 * (left - 1))) & UC_BOTTOM6) | UC_TOP1;
+	}
+
+	return len;
+}
+
+int UC_wctomb(char* s, wchar_t c)
+{
+	/* encode the single wide character c as UTF-8 into s.
+	   returns the number of bytes written, or -1 if c cannot be
+	   encoded. if s is NULL, nothing is written and 0 is returned. */
+
+	if (s == NULL)
+	{
+		/* the caller is asking whether the encoding has shift states.
+		   UTF-8 has none, so the answer is 0.                         */
+		return 0;
+	}
+
+	/* reject what has no UTF-8 form of its own:
+	   * codepoints outside of the valid range, and
+	   * UTF-16 surrogates, which are meaningless as
+	     a single character, even in UTF-16.          */
+	if (!UC_IS_IN_RANGE(c) || UC_IS_SUR(c))
+	{
+		return -1;
+	}
+
+	return UC_wctomb_internal(s, c);
+}
+
+int UC_wcstomb(char* s, const wchar_t* pc)
+{
+	unsigned long int c; /* codepoint decoded from pc. */
+	/* UC_wctomb variant that can read a surrogate pair from pc[0..1]. */
+	if (!s)
+	{
+		/* shift state reset has been requested.
+		   no further action is needed other than returning 0,
+		   because UTF-8 is stateless.                         */
+		return 0;
+	}
+
+	if (!UC_IS_IN_RANGE(*pc))
+	{
+		/* codepoint out of valid range. */
+		return -1;
+	}
+#if UC_UTF16 /* UTF-16 */
+	if (!UC_IS_SUR(*pc))
+	{
+		/* regular character. */
+		c = *pc;
+	}
+	else if (UC_IS_HIGHSUR(pc[0]) && UC_IS_LOWSUR(pc[1]))
+	{
+		/* surrogate pair: two 10-bit halves, high half first. */
+		c = 0;
+		c |= (pc[0] & UC_SURMASK) << 10;
+		c |= (pc[1] & UC_SURMASK) << 0;
+		c += UC_SUR_OFFSET;
+	}
+	else
+	{
+		/* invalid surrogate sequence. */
+		return -1;
+	}
+#else        /* UTF-32 */
+	if (!UC_IS_SUR(*pc))
+	{
+		c = *pc;
+	}
+	else
+	{
+		/* UTF-16 surrogate (invalid in UTF-32). */
+		return -1;
+	}
+#endif
+	/* the result is the number of bytes that were written to s. */
+	return UC_wctomb_internal(s, c);
+}
+
+int UC_mbtowc(wchar_t* pc, const char* s, size_t n)
+{
+	size_t i,     /* index of the byte of s that is being examined. */
+	       trail; /* continuation bytes still expected. */
+	unsigned long int c; /* codepoint that is being decoded. */
+	/* decodes one UTF-8 character from at most n bytes of s into *pc. */
+	if (!s)
+	{
+		/* shift state reset has been requested.
+		   no further action is needed other than returning 0,
+		   because UTF-8 is stateless.                         */
+		return 0;
+	}
+
+	trail = UC_INVALID_SIZE; /* means "no lead byte has been read yet". */
+	/* one extra turn past n is allowed, only to finish a parsed character. */
+	for (i = 0; i < n || !trail; i++)
+	{
+		if (trail == UC_INVALID_SIZE)
+		{
+			/* read the first byte of the character. */
+			if (!s[i])
+			{
+				/* null character: stored like any other, reported as 0. */
+				if (pc) *pc = L'\0';
+				return 0;
+			}
+			else if (UC_IS_1BYTE(s[i]))
+			{
+				/* 1-byte (ASCII) character. */
+				trail = 0;
+				c = s[i];
+			}
+			else if (UC_IS_2BYTE(s[i]))
+			{
+				/* first byte of a 2-byte character. */
+				trail = 1;
+				c = (s[i] & UC_BOTTOM5) << 6;
+			}
+			else if (UC_IS_3BYTE(s[i]))
+			{
+				/* first byte of a 3-byte character.
+				   this might be a UTF-16 surrogate,
+				   which is invalid in UTF-8.        */
+				trail = 2;
+				c = (s[i] & UC_BOTTOM4) << 12;
+			}
+			else if (UC_IS_4BYTE(s[i]))
+			{
+				/* first byte of a 4-byte (non-BMP) character.
+				   this will require checking later:
+				   if the codepoint is 0x10FFFF or lower,
+				   it requires a surrogate pair in UTF-16.
+				   otherwise, it is invalid entirely.         */
+				trail = 3;
+				c = (s[i] & UC_BOTTOM3) << 18;
+			}
+			else
+			{
+				/* unexpected byte, which is either:
+				   * continuation byte
+				     (cannot be at the start of a character),
+				   or
+				   * first byte of a 5/6/7/8-byte character
+				     (nonstandard).                           */
+				return -1;
+			}
+		}
+		else if (!trail)
+		{
+			/* the character has been fully parsed. */
+
+			if (UC_IS_SUR(c))
+			{
+				/* UTF-16 surrogate (invalid in UTF-8). */
+				return -1;
+			}
+			else if (!UC_IS_IN_RANGE(c))
+			{
+				/* codepoint out of valid range. */
+				return -1;
+			}
+#if UC_UTF16 /* UTF-16 */
+			if (UC_IS_BMP(c))
+			{
+				/* BMP character.
+				   can be written normally. */
+				if (pc) *pc = c;
+			}
+			else
+			{
+				/* non-BMP character. requires a surrogate pair,
+				   so pc must have room for two elements here.   */
+				if (pc)
+				{
+					pc[0] =
+					((c - UC_SUR_OFFSET) >> 10) |
+					UC_HIGHSUR;
+
+					pc[1] =
+					((c - UC_SUR_OFFSET) & UC_SURMASK) |
+					UC_LOWSUR;
+				}
+			}
+#else        /* UTF-32 */
+			if (pc) *pc = c;
+#endif
+			break;
+		}
+		else
+		{
+			/* parsing the continuation of a character. */
+
+			if (!UC_IS_CONT(s[i]))
+			{
+				/* unexpected byte
+				   (expected a continuation byte). */
+				return -1;
+			}
+
+			c |= (s[i] & UC_BOTTOM6) << (6 * (--trail));
+		}
+	}
+	/* trail is nonzero here if the character did not end within n bytes. */
+	return !trail ? i : -1;
+}
+
+int UC_wctob(UC_wint_t c)
+{
+	return (c >= 0x00 && c <= 0x7F) ? (int)c : EOF; /* ASCII, or nothing. */
+}
+
+UC_wint_t UC_btowc(int c)
+{
+	return (c >= 0x00 && c <= 0x7F) ? c : UC_WEOF; /* ASCII, or nothing. */
+}
+
+int UC_mblen(const char* s, size_t n)
+{
+	size_t i,     /* index of the byte of s that is being examined. */
+	       trail; /* continuation bytes still expected. */
+	unsigned long int c; /* codepoint that is being decoded. */
+	/* length in bytes of the UTF-8 character at s, reading at most n bytes. */
+	trail = UC_INVALID_SIZE; /* means "no lead byte has been read yet". */
+	if (!s) return 0; /* UTF-8 is stateless: no shift states to report. */
+	for (i = 0; i < n || !trail; i++)
+	{
+		if (trail == UC_INVALID_SIZE)
+		{
+			/* read the first byte of the character. */
+
+			if (!s[i])
+			{
+				/* null character. */
+				return 0;
+			}
+			else if (UC_IS_1BYTE(s[i]))
+			{
+				/* 1-byte (ASCII) character. */
+				trail = 0;
+				c = s[i];
+			}
+			else if (UC_IS_2BYTE(s[i]))
+			{
+				/* first byte of a 2-byte character. */
+				trail = 1;
+				c = (s[i] & UC_BOTTOM5) << 6;
+			}
+			else if (UC_IS_3BYTE(s[i]))
+			{
+				/* first byte of a 3-byte character.
+				   this might be a UTF-16 surrogate,
+				   which is invalid in UTF-8.        */
+				trail = 2;
+				c = (s[i] & UC_BOTTOM4) << 12;
+			}
+			else if (UC_IS_4BYTE(s[i]))
+			{
+				/* first byte of a 4-byte (non-BMP) character.
+				   this will require checking later:
+				   if the codepoint is 0x10FFFF or lower,
+				   it requires a surrogate pair in UTF-16.
+				   otherwise, it is invalid entirely.         */
+				trail = 3;
+				c = (s[i] & UC_BOTTOM3) << 18;
+			}
+			else
+			{
+				/* unexpected byte, which is either:
+				   * continuation byte
+				     (cannot be at the start of a character),
+				   or
+				   * first byte of a 5/6/7/8-byte character
+				     (nonstandard).                           */
+				return -1;
+			}
+		}
+		else if (!trail)
+		{
+			/* the character has been fully parsed. */
+
+			if (UC_IS_SUR(c))
+			{
+				/* UTF-16 surrogate (invalid in UTF-8). */
+				return -1;
+			}
+			else if (!UC_IS_IN_RANGE(c))
+			{
+				/* codepoint out of valid range. */
+				return -1;
+			}
+
+			break;
+		}
+		else
+		{
+			/* parsing the continuation of a character. */
+
+			if (!UC_IS_CONT(s[i]))
+			{
+				/* unexpected byte
+				   (expected a continuation byte). */
+				return -1;
+			}
+
+			c |= (s[i] & UC_BOTTOM6) << (6 * (--trail));
+		}
+	}
+	/* trail is nonzero here if the character did not end within n bytes. */
+	return !trail ? i : -1;
+}
diff --git a/src/unicorn.h b/src/unicorn.h
new file mode 100644
index 0000000..5f1d7a9
--- /dev/null
+++ b/src/unicorn.h
@@ -0,0 +1,95 @@
+#ifndef UC_H_UNICORN
+#define UC_H_UNICORN
+
+
+#include <stddef.h>
+#include <limits.h>
+
+#if __STDC_VERSION__ >= 199901L
+#include <stdint.h>
+#endif
+
+
+#if !defined(WCHAR_MAX)
+#if defined(__WCHAR_MAX)
+#define WCHAR_MAX __WCHAR_MAX
+#elif defined(__WCHAR_MAX__)
+#define WCHAR_MAX __WCHAR_MAX__
+#else
+#error "UNICORN: could not detect the size of a wide character."
+#endif
+#endif
+
+#if WCHAR_MAX < 0xFFFF
+#error "UNICORN: a wide character must have a maximum value of at least 0xFFFF."
+#elif WCHAR_MAX < 0x10FFFF
+#define UC_UTF16 1 /* UTF-16 */
+#else
+#define UC_UTF16 0 /* UTF-32 */
+#endif
+
+
+typedef signed long int UC_wint_t;
+
+#define UC_WINT_MAX LONG_MAX
+#define UC_WINT_MIN LONG_MIN
+
+#define UC_WEOF (-1)
+
+#define UC_MB_LEN_MAX 4
+#define UC_MB_CUR_MAX UC_MB_LEN_MAX
+
+
+size_t UC_wcslen(const wchar_t* s);
+size_t UC_wcsnlen(const wchar_t* s, size_t n);
+
+wchar_t* UC_wcscpy(wchar_t* dest, const wchar_t* src);
+wchar_t* UC_wcsncpy(wchar_t* dest, const wchar_t* src, size_t n);
+wchar_t* UC_wcpcpy(wchar_t* dest, const wchar_t* src);
+wchar_t* UC_wcpncpy(wchar_t* dest, const wchar_t* src, size_t n);
+
+wchar_t* UC_wcscat(wchar_t* dest, const wchar_t* src);
+wchar_t* UC_wcsncat(wchar_t* dest, const wchar_t* src, size_t n);
+
+int UC_wcscmp(const wchar_t* s1, const wchar_t* s2);
+int UC_wcsncmp(const wchar_t* s1, const wchar_t* s2, size_t n);
+
+wchar_t* UC_wcschr(const wchar_t* s, wchar_t c);
+wchar_t* UC_wcsrchr(const wchar_t* s, wchar_t c);
+
+wchar_t* UC_wcsstr(const wchar_t* s, const wchar_t* kernel);
+
+size_t UC_wcsspn(const wchar_t* s, const wchar_t* accept);
+size_t UC_wcscspn(const wchar_t* s, const wchar_t* reject);
+
+wchar_t* UC_wcspbrk(const wchar_t* s, const wchar_t* accept);
+
+/*
+wchar_t* UC_wcstok(wchar_t* s, const wchar_t* delim, wchar_t** p);
+*/
+
+wchar_t* UC_wmemset(wchar_t* s, wchar_t c, size_t n);
+
+wchar_t* UC_wmemcpy(wchar_t* dest, const wchar_t* src, size_t n);
+wchar_t* UC_wmemmove(wchar_t* dest, const wchar_t* src, size_t n);
+
+int UC_wmemcmp(const wchar_t* s1, const wchar_t* s2, size_t n);
+
+wchar_t* UC_wmemchr(const wchar_t* s, wchar_t c, size_t n);
+
+size_t UC_wcstombs(char* dest, const wchar_t* src, size_t n);
+size_t UC_mbstowcs(wchar_t* dest, const char* src, size_t n);
+
+int UC_wctomb(char* s, wchar_t c);
+int UC_wcstomb(char* s, const wchar_t* pc);
+int UC_mbtowc(wchar_t* pc, const char* s, size_t n);
+
+/* in UTF-8, these two functions are completely useless and extremely trivial.
+   but whatever, I've still implemented them anyway.                          */
+int UC_wctob(UC_wint_t c);
+UC_wint_t UC_btowc(int c);
+
+int UC_mblen(const char* s, size_t n);
+
+
+#endif /* UC_H_UNICORN */