Unicode to UTF-8 encoding

Engineer
3 min read · Nov 5, 2022

--

유니코드에서 UTF-8 변환표

#define _CRT_SECURE_NO_WARNINGS

#include <stdio.h>

#include <stdlib.h>

#include <inttypes.h>

#include <windows.h>

/*
 * One row of the UTF-8 encoding table: how a single byte of an encoded
 * sequence is built, and which code point range the row applies to.
 */
typedef struct {
	char mask;        /* payload bits of the byte; code point data is ANDed with this */
	char lead;        /* fixed marker bits ORed into the byte */
	uint32_t beg;     /* first code point of the range covered by this row */
	uint32_t end;     /* last code point of the range covered by this row */
	int bits_stored;  /* number of code point bits that fit in the byte */
}utf_t;

/*
 * Encoding table.  Row 0 describes a continuation byte (10xxxxxx); rows 1..4
 * describe the leading byte of a 1-, 2-, 3- and 4-byte sequence, so the row
 * index doubles as the sequence length in bytes.
 *
 * The list is terminated by a NULL pointer so that loops of the form
 * `for (u = utf; *u; ++u)` stop at the end of the array.  (A pointer to a
 * zeroed struct is not NULL and would let such loops run off the array.)
 */
utf_t* utf[] = {
	/*               mask        lead        beg      end       bits */
	[0] = &(utf_t) { 0b00111111, 0b10000000, 0,       0,        6 }, /* continuation: 10xxxxxx */
	[1] = &(utf_t) { 0b01111111, 0b00000000, 0x0000,  0x007F,   7 }, /* 1 byte lead:  0xxxxxxx */
	[2] = &(utf_t) { 0b00011111, 0b11000000, 0x0080,  0x07FF,   5 }, /* 2 byte lead:  110xxxxx */
	[3] = &(utf_t) { 0b00001111, 0b11100000, 0x0800,  0xFFFF,   4 }, /* 3 byte lead:  1110xxxx */
	[4] = &(utf_t) { 0b00000111, 0b11110000, 0x10000, 0x10FFFF, 3 }, /* 4 byte lead:  11110xxx */
	NULL,
};

/* Every length below is a count of bytes. */

/* Number of bytes in the UTF-8 encoding of code point cp. */
int codepoint_len(uint32_t cp);

/* Table row whose lead-bit pattern matches byte ch. */
int utf8_len(char ch);

/* UTF-8 encoding of code point cp, as a NUL-terminated string. */
char *to_utf8(uint32_t cp);

/* Returns a uint32_t; only declared here, no definition in the code shown. */
uint32_t to_cp(const char chr[4]);

/*
 * Compute how many bytes the UTF-8 encoding of a code point occupies.
 *
 * cp: the code point to measure.
 *
 * Returns 1..4, the index of the table row whose [beg, end] range contains
 * cp.  If no row matches (cp is above the last range) the process prints a
 * diagnostic and terminates with exit status 1.
 */
int codepoint_len(const uint32_t cp)
{
	/*
	 * Row 0 of the table describes continuation bytes and has the range
	 * 0..0, so the search starts at row 1; otherwise U+0000 would be
	 * reported as a zero-byte sequence.  The explicit upper bound keeps
	 * the loop inside the table regardless of how it is terminated.
	 */
	for (int len = 1; len <= 4 && utf[len]; ++len) {
		if (cp >= utf[len]->beg && cp <= utf[len]->end) {
			return len;
		}
	}

	/* Out of bounds: no UTF-8 sequence length exists for this value. */
	fprintf(stderr, "codepoint_len: code point 0x%lx is out of range\n",
	        (unsigned long)cp);
	exit(1);
}

/*
 * Classify one byte of UTF-8 text by its marker bits.
 *
 * ch: the byte to inspect.
 *
 * Returns the index of the first table row whose lead pattern matches the
 * non-payload bits of ch: 0 for a continuation byte (10xxxxxx), or 1..4 for
 * the leading byte of a 1- to 4-byte sequence.  If no row matches, the byte
 * is a malformed lead byte; the process prints a diagnostic and terminates
 * with exit status 1.
 */
int utf8_len(const char ch)
{
	/* Compare as unsigned so the result does not depend on char signedness. */
	const unsigned char byte = (unsigned char)ch;

	/* Rows 0..4 are the only real entries; never walk past them. */
	for (int len = 0; len <= 4 && utf[len]; ++len) {
		const unsigned char marker_bits = (unsigned char)~utf[len]->mask;
		const unsigned char lead = (unsigned char)utf[len]->lead;

		if ((byte & marker_bits) == lead) {
			return len;
		}
	}

	/* Malformed leading byte */
	fprintf(stderr, "utf8_len: malformed leading byte 0x%02x\n",
	        (unsigned)byte);
	exit(1);
}

/*
 * Encode a code point as UTF-8.
 *
 * cp: the code point to encode.
 *
 * Returns a pointer to a NUL-terminated string of 1..4 encoded bytes.  The
 * string lives in a static buffer that is overwritten by the next call, so
 * the caller must copy it if it needs to keep it.  Because the result is
 * NUL-terminated, U+0000 comes back as an empty string.
 */
char* to_utf8(const uint32_t cp)
{
	static char ret[5];

	const int bytes = codepoint_len(cp);

	/*
	 * A length outside 1..4 would give a negative shift count or overrun
	 * the buffer; hand back an empty string instead.
	 */
	if (bytes < 1 || bytes > 4) {
		ret[0] = '\0';
		return ret;
	}

	/* Each continuation byte carries this many code point bits. */
	const int cont_bits = utf[0]->bits_stored;

	/* The leading byte holds the bits above all continuation bytes. */
	int shift = cont_bits * (bytes - 1);

	ret[0] = (char)(((cp >> shift) & (uint32_t)utf[bytes]->mask)
	                | (unsigned char)utf[bytes]->lead);

	/* Emit the continuation bytes, most significant group first. */
	for (int i = 1; i < bytes; ++i) {
		shift -= cont_bits;
		ret[i] = (char)(((cp >> shift) & (uint32_t)utf[0]->mask)
		                | (unsigned char)utf[0]->lead);
	}

	ret[bytes] = '\0';

	return ret;
}

/*
 * Demo driver: encodes a fixed list of code points with to_utf8() and prints
 * each code point followed by the bytes of its UTF-8 encoding in hex.
 *
 * Returns 0.
 */
int main(void)
{
	/* The list is terminated by 0, which is therefore not encoded itself. */
	const uint32_t input[] = { 0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0 };

	/* The header names exactly the two columns printed below. */
	printf("Unicode   UTF-8 encoding (hex)\n");
	printf("------------------------------\n");

	for (const uint32_t *in = input; *in; ++in) {
		const char *utf8 = to_utf8(*in);

		printf("U+%-7.4" PRIX32 " ", *in);

		/* Check the bound before reading the byte; at most 4 bytes are encoded. */
		for (int i = 0; i < 4 && utf8[i]; ++i) {
			/* %hhx expects an unsigned char value. */
			printf("%hhx ", (unsigned char)utf8[i]);
		}

		printf("\n");
	}

	return 0;
}

--

--

No responses yet