terminal: Make UTF-8 state machine assemble Unicode code point value
diff --git a/clients/terminal.c b/clients/terminal.c
index 31bcedd..6701fb6 100644
--- a/clients/terminal.c
+++ b/clients/terminal.c
@@ -106,6 +106,7 @@
enum utf8_state state;
int len;
union utf8_char s;
+ uint32_t unicode;
};
static void
@@ -132,6 +133,7 @@
/* single byte, accept */
machine->s.byte[machine->len++] = c;
machine->state = utf8state_accept;
+ machine->unicode = c;
} else if((c & 0xC0) == 0x80) {
/* parser out of sync, ignore byte */
machine->state = utf8state_start;
@@ -139,14 +141,17 @@
/* start of two byte sequence */
machine->s.byte[machine->len++] = c;
machine->state = utf8state_expect1;
+ machine->unicode = c & 0x1f;
} else if((c & 0xF0) == 0xE0) {
/* start of three byte sequence */
machine->s.byte[machine->len++] = c;
machine->state = utf8state_expect2;
+ machine->unicode = c & 0x0f;
} else if((c & 0xF8) == 0xF0) {
/* start of four byte sequence */
machine->s.byte[machine->len++] = c;
machine->state = utf8state_expect3;
+ machine->unicode = c & 0x07;
} else {
/* overlong encoding, reject */
machine->state = utf8state_reject;
@@ -154,6 +159,7 @@
break;
case utf8state_expect3:
machine->s.byte[machine->len++] = c;
+ machine->unicode = (machine->unicode << 6) | (c & 0x3f);
if((c & 0xC0) == 0x80) {
/* all good, continue */
machine->state = utf8state_expect2;
@@ -164,6 +170,7 @@
break;
case utf8state_expect2:
machine->s.byte[machine->len++] = c;
+ machine->unicode = (machine->unicode << 6) | (c & 0x3f);
if((c & 0xC0) == 0x80) {
/* all good, continue */
machine->state = utf8state_expect1;
@@ -174,6 +181,7 @@
break;
case utf8state_expect1:
machine->s.byte[machine->len++] = c;
+ machine->unicode = (machine->unicode << 6) | (c & 0x3f);
if((c & 0xC0) == 0x80) {
/* all good, accept */
machine->state = utf8state_accept;
@@ -190,6 +198,26 @@
return machine->state;
}
+/* Decode the UTF-8 bytes held in utf8 into a Unicode code point by replaying
+ * them through a scratch state machine. Returns U+FFFD (the replacement
+ * character) unless the bytes form a sequence the machine accepts. */
+static uint32_t
+get_unicode(union utf8_char utf8)
+{
+	struct utf8_state_machine machine;
+	int i;
+
+	init_state_machine(&machine);
+	for (i = 0; i < 4; i++) {
+		utf8_next_char(&machine, utf8.byte[i]);
+		if (machine.state == utf8state_accept ||
+		    machine.state == utf8state_reject)
+			break;
+	}
+	/* Unless accepted, machine.unicode is partial or may never have been
+	 * written (e.g. only stray continuation bytes were seen). */
+	return machine.state == utf8state_accept ? machine.unicode : 0xfffd;
+
struct char_sub {
union utf8_char match;
union utf8_char replace;